diff --git a/.gitignore b/.gitignore index 3a1c21c..1a51aaa 100644 --- a/.gitignore +++ b/.gitignore @@ -50,9 +50,14 @@ MANIFEST .coverage htmlcov/ parsed_output/ +*.ndjson +*.ndjson.gz +*.json !tests/example_output/ -!tests/example_output/parsed_output/ -!tests/example_output/logs/ +!tests/example_output/logs/** +!tests/example_output/parsed_output/** +!tests/example_output/logs_complex/** +!tests/example_output/parsed_output_complex/** .tox/ .nox/ .hypothesis/ @@ -63,4 +68,5 @@ ENV/ env.bak/ venv.bak/ + # end diff --git a/tests/example_output/logs_complex/dedicated_log_triton_trace_findhao_.ndjson b/tests/example_output/logs_complex/dedicated_log_triton_trace_findhao_.ndjson new file mode 100644 index 0000000..752bbe0 --- /dev/null +++ b/tests/example_output/logs_complex/dedicated_log_triton_trace_findhao_.ndjson @@ -0,0 +1,1562 @@ +{"event_type":"compilation","pid":171439,"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":149,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":593,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel = self._do_compile(key, signature, device, constexprs, options, attrs, warmup)"},{"line":773,"name":"_do_compile","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel = self.compile(src, target=target, options=options.__dict__)"},{"line":267,"name":"compile","filename":"/scratch/findhao/pta/triton/python/triton/compiler/compiler.py","loc":"compilation_listener("},{"line":752,"name":"maybe_trace_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton("},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ","payload":{"metadata":{"cache_hit":true,"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32,"env":{},"src_attrs":{"(0,)":[["tt.divisibility",16]],"(1,)":[["tt.divisibility",16]],"(2,)":[["tt.divisibility",16]],"(3,)":[["tt.divisibility",16]],"(4,)":[["tt.divisibility",16]],"(5,)":[["tt.divisibility",16]],"(6,)":[["tt.divisibility",16]],"(8,)":[["tt.divisibility",16]],"(10,)":[["tt.divisibility",16]]},"src_cache_key":"5aec8bef23533ced7a4a2dea17fb314b1446b68a9ca72aa80e32caf75b768172","src_constants":{"(7,)":1,"(9,)":1,"(11,)":1,"(12,)":16,"(13,)":16,"(14,)":16,"(15,)":1}},"file_path":{"matmul_kernel.source":"/home/findhao/.triton/cache/OLLHQQSAFCG5SL4XZW5JXT24TZODOVBB4YT3NHJ4XTOJNY4ILMAA/matmul_kernel.source","matmul_kernel.ttir":"/home/findhao/.triton/cache/OLLHQQSAFCG5SL4XZW5JXT24TZODOVBB4YT3NHJ4XTOJNY4ILMAA/matmul_kernel.ttir","matmul_kernel.ttgir":"/home/findhao/.triton/cache/OLLHQQSAFCG5SL4XZW5JXT24TZODOVBB4YT3NHJ4XTOJNY4ILMAA/matmul_kernel.ttgir","matmul_kernel.llir":"/home/findhao/.triton/cache/OLLHQQSAFCG5SL4XZW5JXT24TZODOVBB4YT3NHJ4XTOJNY4ILMAA/matmul_kernel.llir","matmul_kernel.ptx":"/home/findhao/.triton/cache/OLLHQQSAFCG5SL4XZW5JXT24TZODOVBB4YT3NHJ4XTOJNY4ILMAA/matmul_kernel.ptx","matmul_kernel.cubin":"/home/findhao/.triton/cache/OLLHQQSAFCG5SL4XZW5JXT24TZODOVBB4YT3NHJ4XTOJNY4ILMAA/matmul_kernel.cubin","matmul_kernel.json":"/home/findhao/.triton/cache/OLLHQQSAFCG5SL4XZW5JXT24TZODOVBB4YT3NHJ4XTOJNY4ILMAA/matmul_kernel.json"},"file_content":{"matmul_kernel.ttir":"#loc = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0)\nmodule {\n tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg3: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg4: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg5: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg6: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg7: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg8: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0)) attributes {noinline = false} {\n %c15_i32 = arith.constant 15 : i32 loc(#loc1)\n %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16> loc(#loc1)\n %c0_i32 = arith.constant 0 : i32 loc(#loc1)\n %cst_0 = arith.constant dense<16> : tensor<16x16xi32> loc(#loc1)\n %cst_1 = arith.constant dense<0.000000e+00> : tensor<16x16xf32> loc(#loc1)\n %c16_i32 = arith.constant 16 : i32 loc(#loc1)\n %c1_i32 = arith.constant 1 : i32 loc(#loc1)\n %0 = tt.get_program_id x : i32 loc(#loc2)\n %1 = arith.addi %arg3, %c15_i32 : i32 loc(#loc55)\n %2 = arith.divsi %1, %c16_i32 : i32 loc(#loc56)\n %3 = arith.addi %arg4, %c15_i32 : i32 loc(#loc57)\n %4 = arith.divsi %3, %c16_i32 : i32 loc(#loc58)\n %5 = arith.divsi %0, %4 : i32 loc(#loc7)\n %6 = arith.subi %2, %5 : i32 loc(#loc8)\n %7 = arith.minsi %6, %c1_i32 : i32 loc(#loc9)\n %8 = arith.remsi %0, %7 : i32 loc(#loc10)\n %9 = arith.addi %5, %8 : i32 loc(#loc11)\n %10 = arith.remsi %0, %4 : i32 loc(#loc12)\n %11 = arith.divsi %10, %7 : i32 loc(#loc13)\n %12 = arith.muli %9, %c16_i32 : i32 loc(#loc14)\n %13 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc15)\n %14 = tt.splat %12 : i32 -> tensor<16xi32> loc(#loc16)\n %15 = arith.addi %14, %13 : tensor<16xi32> loc(#loc16)\n %16 = tt.splat %arg3 : i32 -> tensor<16xi32> loc(#loc17)\n %17 = arith.remsi %15, %16 : tensor<16xi32> loc(#loc17)\n %18 = arith.muli %11, %c16_i32 : i32 loc(#loc18)\n %19 = tt.splat %18 : i32 -> tensor<16xi32> loc(#loc19)\n %20 = arith.addi %19, %13 : tensor<16xi32> loc(#loc19)\n %21 = tt.splat %arg4 : i32 -> tensor<16xi32> loc(#loc20)\n %22 = arith.remsi %20, %21 : tensor<16xi32> loc(#loc20)\n %23 = tt.expand_dims %17 {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc21)\n %24 = tt.splat %arg6 : i32 -> tensor<16x1xi32> loc(#loc22)\n %25 = arith.muli %23, %24 : tensor<16x1xi32> loc(#loc22)\n %26 = tt.expand_dims %13 {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc23)\n %27 = tt.broadcast %25 : tensor<16x1xi32> -> tensor<16x16xi32> loc(#loc24)\n %28 = tt.broadcast %26 : tensor<1x16xi32> -> tensor<16x16xi32> loc(#loc24)\n %29 = arith.addi %27, %28 : tensor<16x16xi32> loc(#loc24)\n %30 = tt.splat %arg0 : !tt.ptr -> tensor<16x16x!tt.ptr> loc(#loc25)\n %31 = tt.addptr %30, %29 : tensor<16x16x!tt.ptr>, tensor<16x16xi32> loc(#loc25)\n %32 = tt.expand_dims %13 {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc26)\n %33 = tt.splat %arg7 : i32 -> tensor<16x1xi32> loc(#loc27)\n %34 = arith.muli %32, %33 : tensor<16x1xi32> loc(#loc27)\n %35 = tt.expand_dims %22 {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc28)\n %36 = tt.broadcast %34 : tensor<16x1xi32> -> tensor<16x16xi32> loc(#loc29)\n %37 = tt.broadcast %35 : tensor<1x16xi32> -> tensor<16x16xi32> loc(#loc29)\n %38 = arith.addi %36, %37 : tensor<16x16xi32> loc(#loc29)\n %39 = tt.splat %arg1 : !tt.ptr -> tensor<16x16x!tt.ptr> loc(#loc30)\n %40 = tt.addptr %39, %38 : tensor<16x16x!tt.ptr>, tensor<16x16xi32> loc(#loc30)\n %41 = arith.addi %arg5, %c15_i32 : i32 loc(#loc59)\n %42 = arith.divsi %41, %c16_i32 : i32 loc(#loc60)\n %43:3 = scf.for %arg9 = %c0_i32 to %42 step %c1_i32 iter_args(%arg10 = %31, %arg11 = %40, %arg12 = %cst_1) -> (tensor<16x16x!tt.ptr>, tensor<16x16x!tt.ptr>, tensor<16x16xf32>) : i32 {\n %61 = arith.muli %arg9, %c16_i32 : i32 loc(#loc33)\n %62 = arith.subi %arg5, %61 : i32 loc(#loc34)\n %63 = tt.splat %62 : i32 -> tensor<1x16xi32> loc(#loc35)\n %64 = arith.cmpi slt, %26, %63 : tensor<1x16xi32> loc(#loc35)\n %65 = tt.broadcast %64 : tensor<1x16xi1> -> tensor<16x16xi1> loc(#loc36)\n %66 = tt.load %arg10, %65, %cst : tensor<16x16x!tt.ptr> loc(#loc36)\n %67 = tt.splat %62 : i32 -> tensor<16x1xi32> loc(#loc37)\n %68 = arith.cmpi slt, %32, %67 : tensor<16x1xi32> loc(#loc37)\n %69 = tt.broadcast %68 : tensor<16x1xi1> -> tensor<16x16xi1> loc(#loc38)\n %70 = tt.load %arg11, %69, %cst : tensor<16x16x!tt.ptr> loc(#loc38)\n %71 = tt.dot %66, %70, %arg12, inputPrecision = tf32 : tensor<16x16xf16> * tensor<16x16xf16> -> tensor<16x16xf32> loc(#loc39)\n %72 = tt.addptr %arg10, %cst_0 : tensor<16x16x!tt.ptr>, tensor<16x16xi32> loc(#loc40)\n %73 = arith.muli %arg7, %c16_i32 : i32 loc(#loc41)\n %74 = tt.splat %73 : i32 -> tensor<16x16xi32> loc(#loc42)\n %75 = tt.addptr %arg11, %74 : tensor<16x16x!tt.ptr>, tensor<16x16xi32> loc(#loc42)\n scf.yield %72, %75, %71 : tensor<16x16x!tt.ptr>, tensor<16x16x!tt.ptr>, tensor<16x16xf32> loc(#loc43)\n } loc(#loc32)\n %44 = arith.truncf %43#2 : tensor<16x16xf32> to tensor<16x16xf16> loc(#loc44)\n %45 = tt.expand_dims %15 {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc45)\n %46 = tt.splat %arg8 : i32 -> tensor<16x1xi32> loc(#loc46)\n %47 = arith.muli %46, %45 : tensor<16x1xi32> loc(#loc46)\n %48 = tt.splat %arg2 : !tt.ptr -> tensor<16x1x!tt.ptr> loc(#loc47)\n %49 = tt.addptr %48, %47 : tensor<16x1x!tt.ptr>, tensor<16x1xi32> loc(#loc47)\n %50 = tt.expand_dims %20 {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc48)\n %51 = tt.broadcast %49 : tensor<16x1x!tt.ptr> -> tensor<16x16x!tt.ptr> loc(#loc49)\n %52 = tt.broadcast %50 : tensor<1x16xi32> -> tensor<16x16xi32> loc(#loc49)\n %53 = tt.addptr %51, %52 : tensor<16x16x!tt.ptr>, tensor<16x16xi32> loc(#loc49)\n %54 = tt.splat %arg3 : i32 -> tensor<16x1xi32> loc(#loc50)\n %55 = arith.cmpi slt, %45, %54 : tensor<16x1xi32> loc(#loc50)\n %56 = tt.splat %arg4 : i32 -> tensor<1x16xi32> loc(#loc51)\n %57 = arith.cmpi slt, %50, %56 : tensor<1x16xi32> loc(#loc51)\n %58 = tt.broadcast %55 : tensor<16x1xi1> -> tensor<16x16xi1> loc(#loc52)\n %59 = tt.broadcast %57 : tensor<1x16xi1> -> tensor<16x16xi1> loc(#loc52)\n %60 = arith.andi %58, %59 : tensor<16x16xi1> loc(#loc52)\n tt.store %53, %44, %60 : tensor<16x16x!tt.ptr> loc(#loc53)\n tt.return loc(#loc54)\n } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":47:24)\n#loc3 = loc(\"/scratch/findhao/pta/triton/python/triton/language/standard.py\":40:22)\n#loc4 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":48:27)\n#loc5 = loc(\"/scratch/findhao/pta/triton/python/triton/language/standard.py\":40:28)\n#loc6 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":49:27)\n#loc7 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":51:22)\n#loc8 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":53:33)\n#loc9 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":53:46)\n#loc10 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":54:33)\n#loc11 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":54:27)\n#loc12 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":55:19)\n#loc13 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":55:40)\n#loc14 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:23)\n#loc15 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:51)\n#loc16 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:38)\n#loc17 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:68)\n#loc18 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:23)\n#loc19 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:38)\n#loc20 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:68)\n#loc21 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:30)\n#loc22 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:41)\n#loc23 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:60)\n#loc24 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:53)\n#loc25 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:22)\n#loc26 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:29)\n#loc27 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:40)\n#loc28 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:60)\n#loc29 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:52)\n#loc30 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:22)\n#loc31 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":64:33)\n#loc32 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":64:22)\n#loc33 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:59)\n#loc34 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:55)\n#loc35 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:51)\n#loc36 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:20)\n#loc37 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":66:51)\n#loc38 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":66:20)\n#loc39 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":67:33)\n#loc40 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":68:18)\n#loc41 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:33)\n#loc42 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:18)\n#loc43 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:8)\n#loc44 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":70:23)\n#loc45 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:41)\n#loc46 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:33)\n#loc47 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:21)\n#loc48 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:72)\n#loc49 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:52)\n#loc50 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:33)\n#loc51 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:58)\n#loc52 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:39)\n#loc53 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":76:21)\n#loc54 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":76:4)\n#loc55 = loc(callsite(#loc3 at #loc4))\n#loc56 = loc(callsite(#loc5 at #loc4))\n#loc57 = loc(callsite(#loc3 at #loc6))\n#loc58 = loc(callsite(#loc5 at #loc6))\n#loc59 = loc(callsite(#loc3 at #loc31))\n#loc60 = loc(callsite(#loc5 at #loc31))\n","matmul_kernel.ttgir":"#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [16, 2], warpsPerCTA = [1, 1], order = [1, 0]}>\n#blocked1 = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [4, 8], warpsPerCTA = [1, 1], order = [1, 0]}>\n#loc = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0)\n#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>\n#smem = #ttg.shared_memory\nmodule attributes {\"ttg.num-ctas\" = 1 : i32, \"ttg.num-warps\" = 1 : i32, ttg.target = \"cuda:75\", \"ttg.threads-per-warp\" = 32 : i32} {\n tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg3: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg4: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg5: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg6: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg7: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg8: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0)) attributes {noinline = false} {\n %cst = arith.constant dense<16> : tensor<16x16xi32, #blocked> loc(#loc1)\n %c0_i32 = arith.constant 0 : i32 loc(#loc1)\n %c15_i32 = arith.constant 15 : i32 loc(#loc1)\n %c16_i32 = arith.constant 16 : i32 loc(#loc1)\n %c1_i32 = arith.constant 1 : i32 loc(#loc1)\n %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #blocked> loc(#loc1)\n %cst_1 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #blocked1> loc(#loc1)\n %0 = tt.get_program_id x : i32 loc(#loc2)\n %1 = arith.addi %arg3, %c15_i32 : i32 loc(#loc55)\n %2 = arith.divsi %1, %c16_i32 : i32 loc(#loc56)\n %3 = arith.addi %arg4, %c15_i32 : i32 loc(#loc57)\n %4 = arith.divsi %3, %c16_i32 : i32 loc(#loc58)\n %5 = arith.divsi %0, %4 : i32 loc(#loc7)\n %6 = arith.subi %2, %5 : i32 loc(#loc8)\n %7 = arith.minsi %6, %c1_i32 : i32 loc(#loc9)\n %8 = arith.remsi %0, %7 : i32 loc(#loc10)\n %9 = arith.addi %5, %8 : i32 loc(#loc11)\n %10 = arith.remsi %0, %4 : i32 loc(#loc12)\n %11 = arith.divsi %10, %7 : i32 loc(#loc13)\n %12 = arith.muli %9, %c16_i32 : i32 loc(#loc14)\n %13 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc15)\n %14 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc15)\n %15 = tt.splat %12 : i32 -> tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc16)\n %16 = arith.addi %15, %13 : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc16)\n %17 = tt.splat %arg3 : i32 -> tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc17)\n %18 = arith.remsi %16, %17 : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc17)\n %19 = arith.muli %11, %c16_i32 : i32 loc(#loc18)\n %20 = tt.splat %19 : i32 -> tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc19)\n %21 = arith.addi %20, %14 : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc19)\n %22 = tt.splat %arg4 : i32 -> tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc20)\n %23 = arith.remsi %21, %22 : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc20)\n %24 = tt.expand_dims %18 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> loc(#loc21)\n %25 = tt.splat %arg6 : i32 -> tensor<16x1xi32, #blocked> loc(#loc22)\n %26 = arith.muli %24, %25 : tensor<16x1xi32, #blocked> loc(#loc22)\n %27 = tt.expand_dims %14 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc23)\n %28 = tt.broadcast %26 : tensor<16x1xi32, #blocked> -> tensor<16x16xi32, #blocked> loc(#loc24)\n %29 = tt.broadcast %27 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked> loc(#loc24)\n %30 = arith.addi %28, %29 : tensor<16x16xi32, #blocked> loc(#loc24)\n %31 = tt.splat %arg0 : !tt.ptr -> tensor<16x16x!tt.ptr, #blocked> loc(#loc25)\n %32 = tt.addptr %31, %30 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> loc(#loc25)\n %33 = tt.expand_dims %13 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> loc(#loc26)\n %34 = tt.splat %arg7 : i32 -> tensor<16x1xi32, #blocked> loc(#loc27)\n %35 = arith.muli %33, %34 : tensor<16x1xi32, #blocked> loc(#loc27)\n %36 = tt.expand_dims %23 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc28)\n %37 = tt.broadcast %35 : tensor<16x1xi32, #blocked> -> tensor<16x16xi32, #blocked> loc(#loc29)\n %38 = tt.broadcast %36 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked> loc(#loc29)\n %39 = arith.addi %37, %38 : tensor<16x16xi32, #blocked> loc(#loc29)\n %40 = tt.splat %arg1 : !tt.ptr -> tensor<16x16x!tt.ptr, #blocked> loc(#loc30)\n %41 = tt.addptr %40, %39 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> loc(#loc30)\n %42 = arith.addi %arg5, %c15_i32 : i32 loc(#loc59)\n %43 = arith.divsi %42, %c16_i32 : i32 loc(#loc60)\n %44 = arith.muli %arg7, %c16_i32 : i32 loc(#loc32)\n %45 = tt.splat %44 : i32 -> tensor<16x16xi32, #blocked> loc(#loc33)\n %46:3 = scf.for %arg9 = %c0_i32 to %43 step %c1_i32 iter_args(%arg10 = %cst_1, %arg11 = %32, %arg12 = %41) -> (tensor<16x16xf32, #blocked1>, tensor<16x16x!tt.ptr, #blocked>, tensor<16x16x!tt.ptr, #blocked>) : i32 {\n %65 = arith.muli %arg9, %c16_i32 : i32 loc(#loc35)\n %66 = arith.subi %arg5, %65 : i32 loc(#loc36)\n %67 = tt.splat %66 : i32 -> tensor<1x16xi32, #blocked> loc(#loc37)\n %68 = arith.cmpi slt, %27, %67 : tensor<1x16xi32, #blocked> loc(#loc37)\n %69 = tt.broadcast %68 : tensor<1x16xi1, #blocked> -> tensor<16x16xi1, #blocked> loc(#loc38)\n %70 = tt.load %arg11, %69, %cst_0 : tensor<16x16x!tt.ptr, #blocked> loc(#loc38)\n %71 = tt.splat %66 : i32 -> tensor<16x1xi32, #blocked> loc(#loc39)\n %72 = arith.cmpi slt, %33, %71 : tensor<16x1xi32, #blocked> loc(#loc39)\n %73 = tt.broadcast %72 : tensor<16x1xi1, #blocked> -> tensor<16x16xi1, #blocked> loc(#loc40)\n %74 = tt.load %arg12, %73, %cst_0 : tensor<16x16x!tt.ptr, #blocked> loc(#loc40)\n %75 = arith.extf %70 : tensor<16x16xf16, #blocked> to tensor<16x16xf32, #blocked> loc(#loc41)\n %76 = ttg.local_alloc %75 : (tensor<16x16xf32, #blocked>) -> !ttg.memdesc<16x16xf32, #shared, #smem> loc(#loc41)\n %77 = ttg.local_load %76 : !ttg.memdesc<16x16xf32, #shared, #smem> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked1}>> loc(#loc41)\n %78 = arith.extf %74 : tensor<16x16xf16, #blocked> to tensor<16x16xf32, #blocked> loc(#loc41)\n %79 = ttg.local_alloc %78 : (tensor<16x16xf32, #blocked>) -> !ttg.memdesc<16x16xf32, #shared, #smem> loc(#loc41)\n %80 = ttg.local_load %79 : !ttg.memdesc<16x16xf32, #shared, #smem> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked1}>> loc(#loc41)\n %81 = tt.dot %77, %80, %arg10, inputPrecision = tf32 : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked1}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked1}>> -> tensor<16x16xf32, #blocked1> loc(#loc41)\n %82 = tt.addptr %arg11, %cst : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> loc(#loc42)\n %83 = tt.addptr %arg12, %45 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> loc(#loc33)\n scf.yield %81, %82, %83 : tensor<16x16xf32, #blocked1>, tensor<16x16x!tt.ptr, #blocked>, tensor<16x16x!tt.ptr, #blocked> loc(#loc43)\n } loc(#loc34)\n %47 = arith.truncf %46#0 : tensor<16x16xf32, #blocked1> to tensor<16x16xf16, #blocked1> loc(#loc44)\n %48 = tt.expand_dims %16 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> loc(#loc45)\n %49 = tt.splat %arg8 : i32 -> tensor<16x1xi32, #blocked> loc(#loc46)\n %50 = arith.muli %49, %48 : tensor<16x1xi32, #blocked> loc(#loc46)\n %51 = tt.splat %arg2 : !tt.ptr -> tensor<16x1x!tt.ptr, #blocked> loc(#loc47)\n %52 = tt.addptr %51, %50 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> loc(#loc47)\n %53 = tt.expand_dims %21 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc48)\n %54 = tt.broadcast %52 : tensor<16x1x!tt.ptr, #blocked> -> tensor<16x16x!tt.ptr, #blocked> loc(#loc49)\n %55 = tt.broadcast %53 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked> loc(#loc49)\n %56 = tt.addptr %54, %55 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> loc(#loc49)\n %57 = tt.splat %arg3 : i32 -> tensor<16x1xi32, #blocked> loc(#loc50)\n %58 = arith.cmpi slt, %48, %57 : tensor<16x1xi32, #blocked> loc(#loc50)\n %59 = tt.splat %arg4 : i32 -> tensor<1x16xi32, #blocked> loc(#loc51)\n %60 = arith.cmpi slt, %53, %59 : tensor<1x16xi32, #blocked> loc(#loc51)\n %61 = tt.broadcast %58 : tensor<16x1xi1, #blocked> -> tensor<16x16xi1, #blocked> loc(#loc52)\n %62 = tt.broadcast %60 : tensor<1x16xi1, #blocked> -> tensor<16x16xi1, #blocked> loc(#loc52)\n %63 = arith.andi %61, %62 : tensor<16x16xi1, #blocked> loc(#loc52)\n %64 = ttg.convert_layout %47 : tensor<16x16xf16, #blocked1> -> tensor<16x16xf16, #blocked> loc(#loc53)\n tt.store %56, %64, %63 : tensor<16x16x!tt.ptr, #blocked> loc(#loc53)\n tt.return loc(#loc54)\n } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":47:24)\n#loc3 = loc(\"/scratch/findhao/pta/triton/python/triton/language/standard.py\":40:22)\n#loc4 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":48:27)\n#loc5 = loc(\"/scratch/findhao/pta/triton/python/triton/language/standard.py\":40:28)\n#loc6 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":49:27)\n#loc7 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":51:22)\n#loc8 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":53:33)\n#loc9 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":53:46)\n#loc10 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":54:33)\n#loc11 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":54:27)\n#loc12 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":55:19)\n#loc13 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":55:40)\n#loc14 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:23)\n#loc15 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:51)\n#loc16 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:38)\n#loc17 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:68)\n#loc18 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:23)\n#loc19 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:38)\n#loc20 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:68)\n#loc21 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:30)\n#loc22 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:41)\n#loc23 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:60)\n#loc24 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:53)\n#loc25 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:22)\n#loc26 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:29)\n#loc27 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:40)\n#loc28 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:60)\n#loc29 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:52)\n#loc30 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:22)\n#loc31 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":64:33)\n#loc32 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:33)\n#loc33 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:18)\n#loc34 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":64:22)\n#loc35 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:59)\n#loc36 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:55)\n#loc37 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:51)\n#loc38 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:20)\n#loc39 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":66:51)\n#loc40 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":66:20)\n#loc41 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":67:33)\n#loc42 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":68:18)\n#loc43 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:8)\n#loc44 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":70:23)\n#loc45 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:41)\n#loc46 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:33)\n#loc47 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:21)\n#loc48 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:72)\n#loc49 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:52)\n#loc50 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:33)\n#loc51 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:58)\n#loc52 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:39)\n#loc53 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":76:21)\n#loc54 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":76:4)\n#loc55 = loc(callsite(#loc3 at #loc4))\n#loc56 = loc(callsite(#loc5 at #loc4))\n#loc57 = loc(callsite(#loc3 at #loc6))\n#loc58 = loc(callsite(#loc5 at #loc6))\n#loc59 = loc(callsite(#loc3 at #loc31))\n#loc60 = loc(callsite(#loc5 at #loc31))\n","matmul_kernel.llir":"; ModuleID = 'LLVMDialectModule'\nsource_filename = \"LLVMDialectModule\"\ntarget datalayout = \"e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64\"\n\n@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16\n\ndefine ptx_kernel void @matmul_kernel(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9) local_unnamed_addr #0 !dbg !5 {\n %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8\n %12 = add i32 %3, 15, !dbg !9\n %13 = sdiv i32 %12, 16, !dbg !13\n %14 = add i32 %4, 15, !dbg !14\n %15 = sdiv i32 %14, 16, !dbg !16\n %.frozen = freeze i32 %15, !dbg !17\n %16 = sdiv i32 %11, %.frozen, !dbg !17\n %17 = sub i32 %13, %16, !dbg !18\n %18 = tail call i32 @llvm.smin.i32(i32 %17, i32 1), !dbg !19\n %19 = srem i32 %11, %18, !dbg !20\n %20 = add i32 %19, %16, !dbg !21\n %21 = mul i32 %16, %.frozen, !dbg !22\n %.decomposed = sub i32 %11, %21, !dbg !22\n %22 = sdiv i32 %.decomposed, %18, !dbg !23\n %23 = shl i32 %20, 4, !dbg !24\n %24 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !25\n %25 = lshr i32 %24, 1, !dbg !25\n %26 = and i32 %25, 15, !dbg !25\n %27 = shl nuw nsw i32 %24, 3, !dbg !25\n %28 = and i32 %27, 8, !dbg !25\n %29 = or disjoint i32 %23, %26, !dbg !26\n %30 = shl nsw i32 %22, 4, !dbg !27\n %31 = or disjoint i32 %30, %28, !dbg !28\n %32 = add i32 %5, 15, !dbg !29\n %33 = sdiv i32 %32, 16, !dbg !31\n %34 = icmp sgt i32 %32, 15, !dbg !32\n br i1 %34, label %.lr.ph, label %.._crit_edge_crit_edge, !dbg !32\n\n.._crit_edge_crit_edge: ; preds = %10\n %.pre = shl nuw nsw i32 %24, 4, !dbg !33\n br label %._crit_edge, !dbg !32\n\n.lr.ph: ; preds = %10\n %35 = shl i32 %7, 4, !dbg !34\n %36 = srem i32 %31, %4, !dbg !35\n %37 = mul i32 %7, %26, !dbg !36\n %38 = add i32 %36, %37, !dbg !37\n %39 = sext i32 %38 to i64, !dbg !38\n %40 = getelementptr half, ptr addrspace(1) %1, i64 %39, !dbg !38\n %41 = srem i32 %29, %3, !dbg !39\n %42 = mul i32 %41, %6, !dbg !40\n %43 = add i32 %42, %28, !dbg !41\n %44 = sext i32 %43 to i64, !dbg !42\n %45 = getelementptr half, ptr addrspace(1) %0, i64 %44, !dbg !42\n %46 = shl nuw nsw i32 %24, 5\n %47 = and i32 %46, 992\n %48 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %47\n %49 = getelementptr inbounds nuw i8, ptr addrspace(3) %48, i32 16\n %50 = shl nuw nsw i32 %24, 4\n %51 = and i32 %50, 384\n %52 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %51\n %53 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 16\n %54 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 64\n %55 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 80\n %56 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 512\n %57 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 528\n %58 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 576\n %59 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 592\n %60 = and i32 %27, 56\n %61 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %60\n %62 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 64\n %63 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 128\n %64 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 192\n %65 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 256\n %66 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 320\n %67 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 384\n %68 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 448\n %69 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 512\n %70 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 576\n %71 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 640\n %72 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 704\n %73 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 768\n %74 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 832\n %75 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 896\n %76 = getelementptr inbounds nuw i8, ptr addrspace(3) %61, i32 960\n %77 = sext i32 %35 to i64\n %78 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 24\n %79 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 40\n %80 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 56\n %81 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 88\n %82 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 104\n %83 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 120\n %84 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 536\n %85 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 552\n %86 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 568\n %87 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 600\n %88 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 616\n %89 = getelementptr inbounds nuw i8, ptr addrspace(3) %52, i32 632\n br label %90, !dbg !32\n\n90: ; preds = %.lr.ph, %90\n %.pn31132 = phi ptr addrspace(1) [ %40, %.lr.ph ], [ %250, %90 ]\n %.pn15131 = phi ptr addrspace(1) [ %45, %.lr.ph ], [ %249, %90 ]\n %91 = phi i32 [ 0, %.lr.ph ], [ %251, %90 ]\n %92 = phi <8 x float> [ zeroinitializer, %.lr.ph ], [ %248, %90 ]\n %93 = shl i32 %91, 4, !dbg !43\n %94 = sub i32 %5, %93, !dbg !44\n %95 = icmp slt i32 %28, %94, !dbg !45\n %96 = tail call { i32, i32, i32, i32 } asm sideeffect \"mov.u32 $0, $4;\\0A\\09mov.u32 $1, $5;\\0A\\09mov.u32 $2, $6;\\0A\\09mov.u32 $3, $7;\\0A\\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];\", \"=r,=r,=r,=r,r,r,r,r,l,b\"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %.pn15131, i1 %95) #4, !dbg !46\n %97 = extractvalue { i32, i32, i32, i32 } %96, 0, !dbg !46\n %98 = bitcast i32 %97 to <2 x half>, !dbg !46\n %99 = extractvalue { i32, i32, i32, i32 } %96, 1, !dbg !46\n %100 = bitcast i32 %99 to <2 x half>, !dbg !46\n %101 = extractvalue { i32, i32, i32, i32 } %96, 2, !dbg !46\n %102 = bitcast i32 %101 to <2 x half>, !dbg !46\n %103 = extractvalue { i32, i32, i32, i32 } %96, 3, !dbg !46\n %104 = bitcast i32 %103 to <2 x half>, !dbg !46\n %105 = icmp slt i32 %26, %94, !dbg !47\n %106 = tail call { i32, i32, i32, i32 } asm sideeffect \"mov.u32 $0, $4;\\0A\\09mov.u32 $1, $5;\\0A\\09mov.u32 $2, $6;\\0A\\09mov.u32 $3, $7;\\0A\\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];\", \"=r,=r,=r,=r,r,r,r,r,l,b\"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %.pn31132, i1 %105) #4, !dbg !48\n %107 = extractvalue { i32, i32, i32, i32 } %106, 0, !dbg !48\n %108 = bitcast i32 %107 to <2 x half>, !dbg !48\n %109 = extractvalue { i32, i32, i32, i32 } %106, 1, !dbg !48\n %110 = bitcast i32 %109 to <2 x half>, !dbg !48\n %111 = extractvalue { i32, i32, i32, i32 } %106, 2, !dbg !48\n %112 = bitcast i32 %111 to <2 x half>, !dbg !48\n %113 = extractvalue { i32, i32, i32, i32 } %106, 3, !dbg !48\n %114 = bitcast i32 %113 to <2 x half>, !dbg !48\n %115 = shufflevector <2 x half> %98, <2 x half> %100, <4 x i32> , !dbg !49\n %116 = fpext <4 x half> %115 to <4 x float>, !dbg !49\n %117 = shufflevector <2 x half> %102, <2 x half> %104, <4 x i32> , !dbg !49\n %118 = fpext <4 x half> %117 to <4 x float>, !dbg !49\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49\n store <4 x float> %116, ptr addrspace(3) %48, align 16, !dbg !49\n store <4 x float> %118, ptr addrspace(3) %49, align 16, !dbg !49\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49\n %119 = shufflevector <2 x half> %108, <2 x half> %110, <4 x i32> , !dbg !49\n %120 = fpext <4 x half> %119 to <4 x float>, !dbg !49\n %121 = shufflevector <2 x half> %112, <2 x half> %114, <4 x i32> , !dbg !49\n %122 = fpext <4 x half> %121 to <4 x float>, !dbg !49\n %123 = load <4 x float>, ptr addrspace(3) %78, align 8, !dbg !49\n %124 = load <4 x float>, ptr addrspace(3) %79, align 8, !dbg !49\n %125 = load <2 x float>, ptr addrspace(3) %80, align 8, !dbg !49\n %126 = load <4 x float>, ptr addrspace(3) %81, align 8, !dbg !49\n %127 = load <4 x float>, ptr addrspace(3) %82, align 8, !dbg !49\n %128 = load <2 x float>, ptr addrspace(3) %83, align 8, !dbg !49\n %129 = load <4 x float>, ptr addrspace(3) %84, align 8, !dbg !49\n %130 = load <4 x float>, ptr addrspace(3) %85, align 8, !dbg !49\n %131 = load <2 x float>, ptr addrspace(3) %86, align 8, !dbg !49\n %132 = load <4 x float>, ptr addrspace(3) %87, align 8, !dbg !49\n %133 = load <4 x float>, ptr addrspace(3) %88, align 8, !dbg !49\n %134 = load <2 x float>, ptr addrspace(3) %89, align 8, !dbg !49\n %135 = load <4 x float>, ptr addrspace(3) %52, align 16, !dbg !49\n %136 = load <2 x float>, ptr addrspace(3) %53, align 16, !dbg !49\n %137 = load <4 x float>, ptr addrspace(3) %54, align 16, !dbg !49\n %138 = load <2 x float>, ptr addrspace(3) %55, align 16, !dbg !49\n %139 = load <4 x float>, ptr addrspace(3) %56, align 16, !dbg !49\n %140 = load <2 x float>, ptr addrspace(3) %57, align 16, !dbg !49\n %141 = load <4 x float>, ptr addrspace(3) %58, align 16, !dbg !49\n %142 = load <2 x float>, ptr addrspace(3) %59, align 16, !dbg !49\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49\n store <4 x float> %120, ptr addrspace(3) %48, align 16, !dbg !49\n store <4 x float> %122, ptr addrspace(3) %49, align 16, !dbg !49\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49\n %143 = load <2 x float>, ptr addrspace(3) %61, align 8, !dbg !49\n %144 = shufflevector <2 x float> %143, <2 x float> poison, <8 x i32> , !dbg !49\n %145 = load <2 x float>, ptr addrspace(3) %62, align 8, !dbg !49\n %146 = shufflevector <2 x float> %145, <2 x float> poison, <8 x i32> , !dbg !49\n %147 = load <2 x float>, ptr addrspace(3) %63, align 8, !dbg !49\n %148 = shufflevector <2 x float> %147, <2 x float> poison, <8 x i32> , !dbg !49\n %149 = load <2 x float>, ptr addrspace(3) %64, align 8, !dbg !49\n %150 = shufflevector <2 x float> %149, <2 x float> poison, <8 x i32> , !dbg !49\n %151 = load <2 x float>, ptr addrspace(3) %65, align 8, !dbg !49\n %152 = shufflevector <2 x float> %151, <2 x float> poison, <8 x i32> , !dbg !49\n %153 = load <2 x float>, ptr addrspace(3) %66, align 8, !dbg !49\n %154 = shufflevector <2 x float> %153, <2 x float> poison, <8 x i32> , !dbg !49\n %155 = load <2 x float>, ptr addrspace(3) %67, align 8, !dbg !49\n %156 = shufflevector <2 x float> %155, <2 x float> poison, <8 x i32> , !dbg !49\n %157 = load <2 x float>, ptr addrspace(3) %68, align 8, !dbg !49\n %158 = shufflevector <2 x float> %157, <2 x float> poison, <8 x i32> , !dbg !49\n %159 = load <2 x float>, ptr addrspace(3) %69, align 8, !dbg !49\n %160 = shufflevector <2 x float> %159, <2 x float> poison, <8 x i32> , !dbg !49\n %161 = load <2 x float>, ptr addrspace(3) %70, align 8, !dbg !49\n %162 = shufflevector <2 x float> %161, <2 x float> poison, <8 x i32> , !dbg !49\n %163 = load <2 x float>, ptr addrspace(3) %71, align 8, !dbg !49\n %164 = shufflevector <2 x float> %163, <2 x float> poison, <8 x i32> , !dbg !49\n %165 = load <2 x float>, ptr addrspace(3) %72, align 8, !dbg !49\n %166 = shufflevector <2 x float> %165, <2 x float> poison, <8 x i32> , !dbg !49\n %167 = load <2 x float>, ptr addrspace(3) %73, align 8, !dbg !49\n %168 = shufflevector <2 x float> %167, <2 x float> poison, <8 x i32> , !dbg !49\n %169 = load <2 x float>, ptr addrspace(3) %74, align 8, !dbg !49\n %170 = shufflevector <2 x float> %169, <2 x float> poison, <8 x i32> , !dbg !49\n %171 = load <2 x float>, ptr addrspace(3) %75, align 8, !dbg !49\n %172 = shufflevector <2 x float> %171, <2 x float> poison, <8 x i32> , !dbg !49\n %173 = load <2 x float>, ptr addrspace(3) %76, align 8, !dbg !49\n %174 = shufflevector <2 x float> %173, <2 x float> poison, <8 x i32> , !dbg !49\n %175 = shufflevector <4 x float> %135, <4 x float> %137, <8 x i32> , !dbg !49\n %176 = shufflevector <4 x float> %139, <4 x float> poison, <8 x i32> , !dbg !49\n %177 = shufflevector <8 x float> %175, <8 x float> %176, <8 x i32> , !dbg !49\n %178 = shufflevector <4 x float> %141, <4 x float> poison, <8 x i32> , !dbg !49\n %179 = shufflevector <8 x float> %177, <8 x float> %178, <8 x i32> , !dbg !49\n %180 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %179, <8 x float> %144, <8 x float> %92), !dbg !49\n %181 = shufflevector <4 x float> %135, <4 x float> %137, <8 x i32> , !dbg !49\n %182 = shufflevector <8 x float> %181, <8 x float> %176, <8 x i32> , !dbg !49\n %183 = shufflevector <8 x float> %182, <8 x float> %178, <8 x i32> , !dbg !49\n %184 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %183, <8 x float> %146, <8 x float> %180), !dbg !49\n %185 = shufflevector <4 x float> %135, <4 x float> %137, <8 x i32> , !dbg !49\n %186 = shufflevector <8 x float> %185, <8 x float> %176, <8 x i32> , !dbg !49\n %187 = shufflevector <8 x float> %186, <8 x float> %178, <8 x i32> , !dbg !49\n %188 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %187, <8 x float> %148, <8 x float> %184), !dbg !49\n %189 = shufflevector <4 x float> %135, <4 x float> %137, <8 x i32> , !dbg !49\n %190 = shufflevector <8 x float> %189, <8 x float> %176, <8 x i32> , !dbg !49\n %191 = shufflevector <8 x float> %190, <8 x float> %178, <8 x i32> , !dbg !49\n %192 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %191, <8 x float> %150, <8 x float> %188), !dbg !49\n %193 = shufflevector <2 x float> %136, <2 x float> %138, <8 x i32> , !dbg !49\n %194 = shufflevector <2 x float> %140, <2 x float> poison, <8 x i32> , !dbg !49\n %195 = shufflevector <8 x float> %193, <8 x float> %194, <8 x i32> , !dbg !49\n %196 = shufflevector <2 x float> %142, <2 x float> poison, <8 x i32> , !dbg !49\n %197 = shufflevector <8 x float> %195, <8 x float> %196, <8 x i32> , !dbg !49\n %198 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %197, <8 x float> %152, <8 x float> %192), !dbg !49\n %199 = shufflevector <2 x float> %136, <2 x float> %138, <8 x i32> , !dbg !49\n %200 = shufflevector <8 x float> %199, <8 x float> %194, <8 x i32> , !dbg !49\n %201 = shufflevector <8 x float> %200, <8 x float> %196, <8 x i32> , !dbg !49\n %202 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %201, <8 x float> %154, <8 x float> %198), !dbg !49\n %203 = shufflevector <4 x float> %123, <4 x float> %126, <8 x i32> , !dbg !49\n %204 = shufflevector <4 x float> %129, <4 x float> poison, <8 x i32> , !dbg !49\n %205 = shufflevector <8 x float> %203, <8 x float> %204, <8 x i32> , !dbg !49\n %206 = shufflevector <4 x float> %132, <4 x float> poison, <8 x i32> , !dbg !49\n %207 = shufflevector <8 x float> %205, <8 x float> %206, <8 x i32> , !dbg !49\n %208 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %207, <8 x float> %156, <8 x float> %202), !dbg !49\n %209 = shufflevector <4 x float> %123, <4 x float> %126, <8 x i32> , !dbg !49\n %210 = shufflevector <8 x float> %209, <8 x float> %204, <8 x i32> , !dbg !49\n %211 = shufflevector <8 x float> %210, <8 x float> %206, <8 x i32> , !dbg !49\n %212 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %211, <8 x float> %158, <8 x float> %208), !dbg !49\n %213 = shufflevector <4 x float> %123, <4 x float> %126, <8 x i32> , !dbg !49\n %214 = shufflevector <8 x float> %213, <8 x float> %204, <8 x i32> , !dbg !49\n %215 = shufflevector <8 x float> %214, <8 x float> %206, <8 x i32> , !dbg !49\n %216 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %215, <8 x float> %160, <8 x float> %212), !dbg !49\n %217 = shufflevector <4 x float> %123, <4 x float> %126, <8 x i32> , !dbg !49\n %218 = shufflevector <8 x float> %217, <8 x float> %204, <8 x i32> , !dbg !49\n %219 = shufflevector <8 x float> %218, <8 x float> %206, <8 x i32> , !dbg !49\n %220 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %219, <8 x float> %162, <8 x float> %216), !dbg !49\n %221 = shufflevector <4 x float> %124, <4 x float> %127, <8 x i32> , !dbg !49\n %222 = shufflevector <4 x float> %130, <4 x float> poison, <8 x i32> , !dbg !49\n %223 = shufflevector <8 x float> %221, <8 x float> %222, <8 x i32> , !dbg !49\n %224 = shufflevector <4 x float> %133, <4 x float> poison, <8 x i32> , !dbg !49\n %225 = shufflevector <8 x float> %223, <8 x float> %224, <8 x i32> , !dbg !49\n %226 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %225, <8 x float> %164, <8 x float> %220), !dbg !49\n %227 = shufflevector <4 x float> %124, <4 x float> %127, <8 x i32> , !dbg !49\n %228 = shufflevector <8 x float> %227, <8 x float> %222, <8 x i32> , !dbg !49\n %229 = shufflevector <8 x float> %228, <8 x float> %224, <8 x i32> , !dbg !49\n %230 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %229, <8 x float> %166, <8 x float> %226), !dbg !49\n %231 = shufflevector <4 x float> %124, <4 x float> %127, <8 x i32> , !dbg !49\n %232 = shufflevector <8 x float> %231, <8 x float> %222, <8 x i32> , !dbg !49\n %233 = shufflevector <8 x float> %232, <8 x float> %224, <8 x i32> , !dbg !49\n %234 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %233, <8 x float> %168, <8 x float> %230), !dbg !49\n %235 = shufflevector <4 x float> %124, <4 x float> %127, <8 x i32> , !dbg !49\n %236 = shufflevector <8 x float> %235, <8 x float> %222, <8 x i32> , !dbg !49\n %237 = shufflevector <8 x float> %236, <8 x float> %224, <8 x i32> , !dbg !49\n %238 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %237, <8 x float> %170, <8 x float> %234), !dbg !49\n %239 = shufflevector <2 x float> %125, <2 x float> %128, <8 x i32> , !dbg !49\n %240 = shufflevector <2 x float> %131, <2 x float> poison, <8 x i32> , !dbg !49\n %241 = shufflevector <8 x float> %239, <8 x float> %240, <8 x i32> , !dbg !49\n %242 = shufflevector <2 x float> %134, <2 x float> poison, <8 x i32> , !dbg !49\n %243 = shufflevector <8 x float> %241, <8 x float> %242, <8 x i32> , !dbg !49\n %244 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %243, <8 x float> %172, <8 x float> %238), !dbg !49\n %245 = shufflevector <2 x float> %125, <2 x float> %128, <8 x i32> , !dbg !49\n %246 = shufflevector <8 x float> %245, <8 x float> %240, <8 x i32> , !dbg !49\n %247 = shufflevector <8 x float> %246, <8 x float> %242, <8 x i32> , !dbg !49\n %248 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %247, <8 x float> %174, <8 x float> %244), !dbg !49\n %249 = getelementptr i8, ptr addrspace(1) %.pn15131, i64 32, !dbg !50\n %250 = getelementptr half, ptr addrspace(1) %.pn31132, i64 %77, !dbg !51\n %251 = add nuw nsw i32 %91, 1, !dbg !32\n %exitcond.not = icmp eq i32 %251, %33, !dbg !32\n br i1 %exitcond.not, label %._crit_edge.loopexit, label %90, !dbg !32\n\n._crit_edge.loopexit: ; preds = %90\n %252 = fptrunc <8 x float> %248 to <8 x half>, !dbg !52\n br label %._crit_edge, !dbg !52\n\n._crit_edge: ; preds = %._crit_edge.loopexit, %.._crit_edge_crit_edge\n %.pre-phi = phi i32 [ %.pre, %.._crit_edge_crit_edge ], [ %50, %._crit_edge.loopexit ], !dbg !33\n %253 = phi <8 x half> [ zeroinitializer, %.._crit_edge_crit_edge ], [ %252, %._crit_edge.loopexit ]\n %254 = mul i32 %29, %8, !dbg !53\n %255 = sext i32 %254 to i64, !dbg !54\n %256 = getelementptr half, ptr addrspace(1) %2, i64 %255, !dbg !54\n %257 = sext i32 %31 to i64, !dbg !55\n %258 = getelementptr half, ptr addrspace(1) %256, i64 %257, !dbg !55\n %259 = icmp slt i32 %29, %3, !dbg !56\n %260 = icmp slt i32 %31, %4, !dbg !57\n %261 = and i1 %259, %260, !dbg !58\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !33\n %262 = and i32 %.pre-phi, 64, !dbg !33\n %263 = and i32 %24, 3, !dbg !33\n %264 = shl nuw nsw i32 %263, 2, !dbg !33\n %265 = or disjoint i32 %262, %264, !dbg !33\n %266 = shl nuw nsw i32 %24, 1, !dbg !33\n %267 = and i32 %266, 48, !dbg !33\n %268 = or disjoint i32 %265, %267, !dbg !33\n %269 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %268, !dbg !33\n %270 = shufflevector <8 x half> %253, <8 x half> poison, <2 x i32> , !dbg !33\n store <2 x half> %270, ptr addrspace(3) %269, align 4, !dbg !33\n %271 = xor i32 %268, 132, !dbg !33\n %272 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %271, !dbg !33\n %273 = shufflevector <8 x half> %253, <8 x half> poison, <2 x i32> , !dbg !33\n store <2 x half> %273, ptr addrspace(3) %272, align 4, !dbg !33\n %274 = xor i32 %268, 264, !dbg !33\n %275 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %274, !dbg !33\n %276 = shufflevector <8 x half> %253, <8 x half> poison, <2 x i32> , !dbg !33\n store <2 x half> %276, ptr addrspace(3) %275, align 4, !dbg !33\n %277 = xor i32 %268, 396, !dbg !33\n %278 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %277, !dbg !33\n %279 = shufflevector <8 x half> %253, <8 x half> poison, <2 x i32> , !dbg !33\n store <2 x half> %279, ptr addrspace(3) %278, align 4, !dbg !33\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !33\n %280 = shl nuw nsw i32 %263, 6, !dbg !33\n %281 = and i32 %24, 16, !dbg !33\n %282 = shl nuw nsw i32 %281, 4, !dbg !33\n %283 = or disjoint i32 %280, %282, !dbg !33\n %284 = shl nuw nsw i32 %24, 2, !dbg !33\n %285 = and i32 %284, 48, !dbg !33\n %286 = or disjoint i32 %283, %285, !dbg !33\n %287 = and i32 %266, 4, !dbg !33\n %288 = or disjoint i32 %286, %287, !dbg !33\n %289 = lshr exact i32 %281, 1, !dbg !33\n %290 = or disjoint i32 %288, %289, !dbg !33\n %291 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %290, !dbg !33\n %292 = load i32, ptr addrspace(3) %291, align 4, !dbg !33\n %293 = xor i32 %290, 4, !dbg !33\n %294 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %293, !dbg !33\n %295 = load i32, ptr addrspace(3) %294, align 4, !dbg !33\n %296 = xor i32 %290, 8, !dbg !33\n %297 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %296, !dbg !33\n %298 = load i32, ptr addrspace(3) %297, align 4, !dbg !33\n %299 = xor i32 %290, 12, !dbg !33\n %300 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %299, !dbg !33\n %301 = load i32, ptr addrspace(3) %300, align 4, !dbg !33\n tail call void asm sideeffect \"@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };\", \"r,r,r,r,l,b\"(i32 %292, i32 %295, i32 %298, i32 %301, ptr addrspace(1) %258, i1 %261) #4, !dbg !33\n ret void, !dbg !59\n}\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare i32 @llvm.smin.i32(i32, i32) #1\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1\n\n; Function Attrs: convergent nocallback nounwind\ndeclare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2\n\n; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) #3\n\nattributes #0 = { \"nvvm.reqntid\"=\"32\" }\nattributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }\nattributes #2 = { convergent nocallback nounwind }\nattributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }\nattributes #4 = { nounwind }\n\n!llvm.dbg.cu = !{!0}\n!llvm.module.flags = !{!2, !3}\n!llvm.ident = !{!4}\n\n!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: \"triton\", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)\n!1 = !DIFile(filename: \"test_complex_kernels.py\", directory: \"/scratch/findhao/tritonparse/tests\")\n!2 = !{i32 2, !\"Debug Info Version\", i32 3}\n!3 = !{i32 4, !\"nvvm-reflect-ftz\", i32 1}\n!4 = !{!\"clang version 3.8.0 (tags/RELEASE_380/final)\"}\n!5 = distinct !DISubprogram(name: \"matmul_kernel\", linkageName: \"matmul_kernel\", scope: !1, file: !1, line: 38, type: !6, scopeLine: 38, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)\n!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)\n!7 = !{}\n!8 = !DILocation(line: 47, column: 24, scope: !5)\n!9 = !DILocation(line: 40, column: 22, scope: !10, inlinedAt: !12)\n!10 = distinct !DILexicalBlockFile(scope: !5, file: !11, discriminator: 0)\n!11 = !DIFile(filename: \"standard.py\", directory: \"/scratch/findhao/pta/triton/python/triton/language\")\n!12 = !DILocation(line: 48, column: 27, scope: !5)\n!13 = !DILocation(line: 40, column: 28, scope: !10, inlinedAt: !12)\n!14 = !DILocation(line: 40, column: 22, scope: !10, inlinedAt: !15)\n!15 = !DILocation(line: 49, column: 27, scope: !5)\n!16 = !DILocation(line: 40, column: 28, scope: !10, inlinedAt: !15)\n!17 = !DILocation(line: 51, column: 22, scope: !5)\n!18 = !DILocation(line: 53, column: 33, scope: !5)\n!19 = !DILocation(line: 53, column: 46, scope: !5)\n!20 = !DILocation(line: 54, column: 33, scope: !5)\n!21 = !DILocation(line: 54, column: 27, scope: !5)\n!22 = !DILocation(line: 55, column: 19, scope: !5)\n!23 = !DILocation(line: 55, column: 40, scope: !5)\n!24 = !DILocation(line: 57, column: 23, scope: !5)\n!25 = !DILocation(line: 57, column: 51, scope: !5)\n!26 = !DILocation(line: 57, column: 38, scope: !5)\n!27 = !DILocation(line: 58, column: 23, scope: !5)\n!28 = !DILocation(line: 58, column: 38, scope: !5)\n!29 = !DILocation(line: 40, column: 22, scope: !10, inlinedAt: !30)\n!30 = !DILocation(line: 64, column: 33, scope: !5)\n!31 = !DILocation(line: 40, column: 28, scope: !10, inlinedAt: !30)\n!32 = !DILocation(line: 64, column: 22, scope: !5)\n!33 = !DILocation(line: 76, column: 21, scope: !5)\n!34 = !DILocation(line: 69, column: 33, scope: !5)\n!35 = !DILocation(line: 58, column: 68, scope: !5)\n!36 = !DILocation(line: 61, column: 40, scope: !5)\n!37 = !DILocation(line: 61, column: 52, scope: !5)\n!38 = !DILocation(line: 61, column: 22, scope: !5)\n!39 = !DILocation(line: 57, column: 68, scope: !5)\n!40 = !DILocation(line: 60, column: 41, scope: !5)\n!41 = !DILocation(line: 60, column: 53, scope: !5)\n!42 = !DILocation(line: 60, column: 22, scope: !5)\n!43 = !DILocation(line: 65, column: 59, scope: !5)\n!44 = !DILocation(line: 65, column: 55, scope: !5)\n!45 = !DILocation(line: 65, column: 51, scope: !5)\n!46 = !DILocation(line: 65, column: 20, scope: !5)\n!47 = !DILocation(line: 66, column: 51, scope: !5)\n!48 = !DILocation(line: 66, column: 20, scope: !5)\n!49 = !DILocation(line: 67, column: 33, scope: !5)\n!50 = !DILocation(line: 68, column: 18, scope: !5)\n!51 = !DILocation(line: 69, column: 18, scope: !5)\n!52 = !DILocation(line: 70, column: 23, scope: !5)\n!53 = !DILocation(line: 74, column: 33, scope: !5)\n!54 = !DILocation(line: 74, column: 21, scope: !5)\n!55 = !DILocation(line: 74, column: 52, scope: !5)\n!56 = !DILocation(line: 75, column: 33, scope: !5)\n!57 = !DILocation(line: 75, column: 58, scope: !5)\n!58 = !DILocation(line: 75, column: 39, scope: !5)\n!59 = !DILocation(line: 76, column: 4, scope: !5)\n","matmul_kernel.ptx":"//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 8.7\n.target sm_75\n.address_size 64\n\n\t// .globl\tmatmul_kernel // -- Begin function matmul_kernel\n.extern .shared .align 16 .b8 global_smem[];\n // @matmul_kernel\n.visible .entry matmul_kernel(\n\t.param .u64 .ptr .global .align 1 matmul_kernel_param_0,\n\t.param .u64 .ptr .global .align 1 matmul_kernel_param_1,\n\t.param .u64 .ptr .global .align 1 matmul_kernel_param_2,\n\t.param .u32 matmul_kernel_param_3,\n\t.param .u32 matmul_kernel_param_4,\n\t.param .u32 matmul_kernel_param_5,\n\t.param .u32 matmul_kernel_param_6,\n\t.param .u32 matmul_kernel_param_7,\n\t.param .u32 matmul_kernel_param_8,\n\t.param .u64 .ptr .global .align 1 matmul_kernel_param_9\n)\n.reqntid 32\n{\n\t.reg .pred \t%p<8>;\n\t.reg .b16 \t%rs<25>;\n\t.reg .b32 \t%r<398>;\n\t.reg .b64 \t%rd<21>;\n\t.loc\t1 38 0 // test_complex_kernels.py:38:0\n$L__func_begin0:\n\t.loc\t1 38 0 // test_complex_kernels.py:38:0\n\n// %bb.0:\n\tld.param.b32 \t%r47, [matmul_kernel_param_8];\n\tld.param.b32 \t%r383, [matmul_kernel_param_5];\n\tld.param.b32 \t%r43, [matmul_kernel_param_4];\n\tld.param.b32 \t%r42, [matmul_kernel_param_3];\n\tld.param.b64 \t%rd10, [matmul_kernel_param_2];\n$L__tmp0:\n\t.loc\t1 47 24 // test_complex_kernels.py:47:24\n\tmov.u32 \t%r48, %ctaid.x;\n$L__tmp1:\n\t.loc\t2 40 22 // standard.py:40:22 @[ test_complex_kernels.py:48:27 ]\n\tadd.s32 \t%r49, %r42, 15;\n\t.loc\t2 40 28 // standard.py:40:28 @[ test_complex_kernels.py:48:27 ]\n\tshr.s32 \t%r50, %r49, 31;\n\tshr.u32 \t%r51, %r50, 28;\n\tadd.s32 \t%r52, %r49, %r51;\n\tshr.s32 \t%r53, %r52, 4;\n$L__tmp2:\n\t.loc\t2 40 22 // standard.py:40:22 @[ test_complex_kernels.py:49:27 ]\n\tadd.s32 \t%r54, %r43, 15;\n\t.loc\t2 40 28 // standard.py:40:28 @[ test_complex_kernels.py:49:27 ]\n\tshr.s32 \t%r55, %r54, 31;\n\tshr.u32 \t%r56, %r55, 28;\n\tadd.s32 \t%r57, %r54, %r56;\n\tshr.s32 \t%r58, %r57, 4;\n$L__tmp3:\n\t.loc\t1 51 22 // test_complex_kernels.py:51:22\n\tdiv.s32 \t%r60, %r48, %r58;\n\t.loc\t1 53 33 // test_complex_kernels.py:53:33\n\tsub.s32 \t%r61, %r53, %r60;\n\t.loc\t1 53 46 // test_complex_kernels.py:53:46\n\tmin.s32 \t%r62, %r61, 1;\n\t.loc\t1 54 33 // test_complex_kernels.py:54:33\n\trem.s32 \t%r63, %r48, %r62;\n\t.loc\t1 54 27 // test_complex_kernels.py:54:27\n\tadd.s32 \t%r64, %r63, %r60;\n\t.loc\t1 55 19 // test_complex_kernels.py:55:19\n\tmul.lo.s32 \t%r65, %r60, %r58;\n\tsub.s32 \t%r66, %r48, %r65;\n\t.loc\t1 55 40 // test_complex_kernels.py:55:40\n\tdiv.s32 \t%r67, %r66, %r62;\n\t.loc\t1 57 23 // test_complex_kernels.py:57:23\n\tshl.b32 \t%r68, %r64, 4;\n\t.loc\t1 57 51 // test_complex_kernels.py:57:51\n\tmov.u32 \t%r1, %tid.x;\n\tbfe.u32 \t%r2, %r1, 1, 4;\n\tshl.b32 \t%r3, %r1, 3;\n\tand.b32 \t%r4, %r3, 8;\n\t.loc\t1 57 38 // test_complex_kernels.py:57:38\n\tor.b32 \t%r5, %r68, %r2;\n\t.loc\t1 58 23 // test_complex_kernels.py:58:23\n\tshl.b32 \t%r69, %r67, 4;\n\t.loc\t1 58 38 // test_complex_kernels.py:58:38\n\tor.b32 \t%r6, %r69, %r4;\n$L__tmp4:\n\t.loc\t2 40 22 // standard.py:40:22 @[ test_complex_kernels.py:64:33 ]\n\tadd.s32 \t%r70, %r383, 15;\n$L__tmp5:\n\t.loc\t1 64 22 // test_complex_kernels.py:64:22\n\tsetp.gt.s32 \t%p1, %r70, 15;\n\tmov.b32 \t%r382, global_smem;\n\t@%p1 bra \t$L__BB0_2;\n\tbra.uni \t$L__BB0_1;\n$L__BB0_2: // %.lr.ph\n\t.loc\t1 0 22 // test_complex_kernels.py:0:22\n\tld.param.b32 \t%r46, [matmul_kernel_param_7];\n\tld.param.b32 \t%r45, [matmul_kernel_param_6];\n\tld.param.b64 \t%rd9, [matmul_kernel_param_1];\n\tld.param.b64 \t%rd8, [matmul_kernel_param_0];\n\tshr.s32 \t%r71, %r70, 31;\n\tshr.u32 \t%r72, %r71, 28;\n\tadd.s32 \t%r73, %r70, %r72;\n\tshr.s32 \t%r384, %r73, 4;\n\t.loc\t1 69 33 // test_complex_kernels.py:69:33\n\tshl.b32 \t%r86, %r46, 4;\n\t.loc\t1 58 68 // test_complex_kernels.py:58:68\n\trem.s32 \t%r87, %r6, %r43;\n\t.loc\t1 61 52 // test_complex_kernels.py:61:52\n\tmad.lo.s32 \t%r88, %r46, %r2, %r87;\n\t.loc\t1 61 22 // test_complex_kernels.py:61:22\n\tmul.wide.s32 \t%rd11, %r88, 2;\n\tadd.s64 \t%rd19, %rd9, %rd11;\n\t.loc\t1 57 68 // test_complex_kernels.py:57:68\n\trem.s32 \t%r89, %r5, %r42;\n\t.loc\t1 60 53 // test_complex_kernels.py:60:53\n\tmad.lo.s32 \t%r90, %r89, %r45, %r4;\n\t.loc\t1 60 22 // test_complex_kernels.py:60:22\n\tmul.wide.s32 \t%rd12, %r90, 2;\n\tadd.s64 \t%rd20, %rd8, %rd12;\n\tshl.b32 \t%r91, %r1, 5;\n\tand.b32 \t%r92, %r91, 992;\n\tadd.s32 \t%r9, %r382, %r92;\n\tshl.b32 \t%r393, %r1, 4;\n\tand.b32 \t%r94, %r393, 384;\n\tadd.s32 \t%r11, %r382, %r94;\n\tand.b32 \t%r95, %r3, 56;\n\tadd.s32 \t%r12, %r382, %r95;\n\t.loc\t1 64 22 // test_complex_kernels.py:64:22\n\tmul.wide.s32 \t%rd3, %r86, 2;\n\tmov.b32 \t%r385, 0f00000000;\n\tmov.b32 \t%r386, %r385;\n\tmov.b32 \t%r387, %r385;\n\tmov.b32 \t%r388, %r385;\n\tmov.b32 \t%r389, %r385;\n\tmov.b32 \t%r390, %r385;\n\tmov.b32 \t%r391, %r385;\n\tmov.b32 \t%r392, %r385;\n$L__BB0_3: // =>This Inner Loop Header: Depth=1\n\t.loc\t1 65 51 // test_complex_kernels.py:65:51\n\tsetp.lt.s32 \t%p2, %r4, %r383;\n\tmov.b32 \t%r100, 0;\n\t.loc\t1 65 20 // test_complex_kernels.py:65:20\n\t// begin inline asm\n\tmov.u32 %r96, %r100;\n\tmov.u32 %r97, %r100;\n\tmov.u32 %r98, %r100;\n\tmov.u32 %r99, %r100;\n\t@%p2 ld.global.v4.b32 { %r96, %r97, %r98, %r99 }, [ %rd20 + 0 ];\n\t// end inline asm\n\t.loc\t1 66 51 // test_complex_kernels.py:66:51\n\tsetp.lt.s32 \t%p3, %r2, %r383;\n\t.loc\t1 66 20 // test_complex_kernels.py:66:20\n\t// begin inline asm\n\tmov.u32 %r104, %r100;\n\tmov.u32 %r105, %r100;\n\tmov.u32 %r106, %r100;\n\tmov.u32 %r107, %r100;\n\t@%p3 ld.global.v4.b32 { %r104, %r105, %r106, %r107 }, [ %rd19 + 0 ];\n\t// end inline asm\n\t.loc\t1 67 33 // test_complex_kernels.py:67:33\n\tmov.b32 \t{%rs1, %rs2}, %r97;\n\tcvt.f32.f16 \t%r112, %rs2;\n\tcvt.f32.f16 \t%r113, %rs1;\n\tmov.b32 \t{%rs3, %rs4}, %r96;\n\tcvt.f32.f16 \t%r114, %rs4;\n\tcvt.f32.f16 \t%r115, %rs3;\n\tmov.b32 \t{%rs5, %rs6}, %r99;\n\tcvt.f32.f16 \t%r116, %rs6;\n\tcvt.f32.f16 \t%r117, %rs5;\n\tmov.b32 \t{%rs7, %rs8}, %r98;\n\tcvt.f32.f16 \t%r118, %rs8;\n\tcvt.f32.f16 \t%r119, %rs7;\n\tbar.sync \t0;\n\tst.shared.v4.b32 \t[%r9], {%r115, %r114, %r113, %r112};\n\tst.shared.v4.b32 \t[%r9+16], {%r119, %r118, %r117, %r116};\n\tbar.sync \t0;\n\tmov.b32 \t{%rs9, %rs10}, %r105;\n\tcvt.f32.f16 \t%r120, %rs10;\n\tcvt.f32.f16 \t%r121, %rs9;\n\tmov.b32 \t{%rs11, %rs12}, %r104;\n\tcvt.f32.f16 \t%r122, %rs12;\n\tcvt.f32.f16 \t%r123, %rs11;\n\tmov.b32 \t{%rs13, %rs14}, %r107;\n\tcvt.f32.f16 \t%r124, %rs14;\n\tcvt.f32.f16 \t%r125, %rs13;\n\tmov.b32 \t{%rs15, %rs16}, %r106;\n\tcvt.f32.f16 \t%r126, %rs16;\n\tcvt.f32.f16 \t%r127, %rs15;\n\tld.shared.v2.b32 \t{%r128, %r129}, [%r11+32];\n\tld.shared.v2.b32 \t{%r130, %r131}, [%r11+24];\n\tld.shared.v2.b32 \t{%r132, %r133}, [%r11+48];\n\tld.shared.v2.b32 \t{%r134, %r135}, [%r11+40];\n\tld.shared.v2.b32 \t{%r136, %r137}, [%r11+56];\n\tld.shared.v2.b32 \t{%r138, %r139}, [%r11+96];\n\tld.shared.v2.b32 \t{%r140, %r141}, [%r11+88];\n\tld.shared.v2.b32 \t{%r142, %r143}, [%r11+112];\n\tld.shared.v2.b32 \t{%r144, %r145}, [%r11+104];\n\tld.shared.v2.b32 \t{%r146, %r147}, [%r11+120];\n\tld.shared.v2.b32 \t{%r148, %r149}, [%r11+544];\n\tld.shared.v2.b32 \t{%r150, %r151}, [%r11+536];\n\tld.shared.v2.b32 \t{%r152, %r153}, [%r11+560];\n\tld.shared.v2.b32 \t{%r154, %r155}, [%r11+552];\n\tld.shared.v2.b32 \t{%r156, %r157}, [%r11+568];\n\tld.shared.v2.b32 \t{%r158, %r159}, [%r11+608];\n\tld.shared.v2.b32 \t{%r160, %r161}, [%r11+600];\n\tld.shared.v2.b32 \t{%r162, %r163}, [%r11+624];\n\tld.shared.v2.b32 \t{%r164, %r165}, [%r11+616];\n\tld.shared.v2.b32 \t{%r166, %r167}, [%r11+632];\n\tld.shared.v4.b32 \t{%r168, %r169, %r170, %r171}, [%r11];\n\tld.shared.v2.b32 \t{%r172, %r173}, [%r11+16];\n\tld.shared.v4.b32 \t{%r174, %r175, %r176, %r177}, [%r11+64];\n\tld.shared.v2.b32 \t{%r178, %r179}, [%r11+80];\n\tld.shared.v4.b32 \t{%r180, %r181, %r182, %r183}, [%r11+512];\n\tld.shared.v2.b32 \t{%r184, %r185}, [%r11+528];\n\tld.shared.v4.b32 \t{%r186, %r187, %r188, %r189}, [%r11+576];\n\tld.shared.v2.b32 \t{%r190, %r191}, [%r11+592];\n\tbar.sync \t0;\n\tst.shared.v4.b32 \t[%r9], {%r123, %r122, %r121, %r120};\n\tst.shared.v4.b32 \t[%r9+16], {%r127, %r126, %r125, %r124};\n\tbar.sync \t0;\n\tld.shared.v2.b32 \t{%r192, %r193}, [%r12];\n\tld.shared.v2.b32 \t{%r194, %r195}, [%r12+64];\n\tld.shared.v2.b32 \t{%r196, %r197}, [%r12+128];\n\tld.shared.v2.b32 \t{%r198, %r199}, [%r12+192];\n\tld.shared.v2.b32 \t{%r200, %r201}, [%r12+256];\n\tld.shared.v2.b32 \t{%r202, %r203}, [%r12+320];\n\tld.shared.v2.b32 \t{%r204, %r205}, [%r12+384];\n\tld.shared.v2.b32 \t{%r206, %r207}, [%r12+448];\n\tld.shared.v2.b32 \t{%r208, %r209}, [%r12+512];\n\tld.shared.v2.b32 \t{%r210, %r211}, [%r12+576];\n\tld.shared.v2.b32 \t{%r212, %r213}, [%r12+640];\n\tld.shared.v2.b32 \t{%r214, %r215}, [%r12+704];\n\tld.shared.v2.b32 \t{%r216, %r217}, [%r12+768];\n\tld.shared.v2.b32 \t{%r218, %r219}, [%r12+832];\n\tld.shared.v2.b32 \t{%r220, %r221}, [%r12+896];\n\tld.shared.v2.b32 \t{%r222, %r223}, [%r12+960];\n\tfma.rn.f32 \t%r224, %r168, %r192, %r385;\n\tfma.rn.f32 \t%r225, %r168, %r193, %r386;\n\tfma.rn.f32 \t%r226, %r174, %r192, %r387;\n\tfma.rn.f32 \t%r227, %r174, %r193, %r388;\n\tfma.rn.f32 \t%r228, %r180, %r192, %r389;\n\tfma.rn.f32 \t%r229, %r180, %r193, %r390;\n\tfma.rn.f32 \t%r230, %r186, %r192, %r391;\n\tfma.rn.f32 \t%r231, %r186, %r193, %r392;\n\tfma.rn.f32 \t%r232, %r187, %r195, %r231;\n\tfma.rn.f32 \t%r233, %r187, %r194, %r230;\n\tfma.rn.f32 \t%r234, %r181, %r195, %r229;\n\tfma.rn.f32 \t%r235, %r181, %r194, %r228;\n\tfma.rn.f32 \t%r236, %r175, %r195, %r227;\n\tfma.rn.f32 \t%r237, %r175, %r194, %r226;\n\tfma.rn.f32 \t%r238, %r169, %r195, %r225;\n\tfma.rn.f32 \t%r239, %r169, %r194, %r224;\n\tfma.rn.f32 \t%r240, %r170, %r196, %r239;\n\tfma.rn.f32 \t%r241, %r170, %r197, %r238;\n\tfma.rn.f32 \t%r242, %r176, %r196, %r237;\n\tfma.rn.f32 \t%r243, %r176, %r197, %r236;\n\tfma.rn.f32 \t%r244, %r182, %r196, %r235;\n\tfma.rn.f32 \t%r245, %r182, %r197, %r234;\n\tfma.rn.f32 \t%r246, %r188, %r196, %r233;\n\tfma.rn.f32 \t%r247, %r188, %r197, %r232;\n\tfma.rn.f32 \t%r248, %r189, %r199, %r247;\n\tfma.rn.f32 \t%r249, %r189, %r198, %r246;\n\tfma.rn.f32 \t%r250, %r183, %r199, %r245;\n\tfma.rn.f32 \t%r251, %r183, %r198, %r244;\n\tfma.rn.f32 \t%r252, %r177, %r199, %r243;\n\tfma.rn.f32 \t%r253, %r177, %r198, %r242;\n\tfma.rn.f32 \t%r254, %r171, %r199, %r241;\n\tfma.rn.f32 \t%r255, %r171, %r198, %r240;\n\tfma.rn.f32 \t%r256, %r172, %r200, %r255;\n\tfma.rn.f32 \t%r257, %r172, %r201, %r254;\n\tfma.rn.f32 \t%r258, %r178, %r200, %r253;\n\tfma.rn.f32 \t%r259, %r178, %r201, %r252;\n\tfma.rn.f32 \t%r260, %r184, %r200, %r251;\n\tfma.rn.f32 \t%r261, %r184, %r201, %r250;\n\tfma.rn.f32 \t%r262, %r190, %r200, %r249;\n\tfma.rn.f32 \t%r263, %r190, %r201, %r248;\n\tfma.rn.f32 \t%r264, %r191, %r203, %r263;\n\tfma.rn.f32 \t%r265, %r191, %r202, %r262;\n\tfma.rn.f32 \t%r266, %r185, %r203, %r261;\n\tfma.rn.f32 \t%r267, %r185, %r202, %r260;\n\tfma.rn.f32 \t%r268, %r179, %r203, %r259;\n\tfma.rn.f32 \t%r269, %r179, %r202, %r258;\n\tfma.rn.f32 \t%r270, %r173, %r203, %r257;\n\tfma.rn.f32 \t%r271, %r173, %r202, %r256;\n\tfma.rn.f32 \t%r272, %r130, %r204, %r271;\n\tfma.rn.f32 \t%r273, %r130, %r205, %r270;\n\tfma.rn.f32 \t%r274, %r140, %r204, %r269;\n\tfma.rn.f32 \t%r275, %r140, %r205, %r268;\n\tfma.rn.f32 \t%r276, %r150, %r204, %r267;\n\tfma.rn.f32 \t%r277, %r150, %r205, %r266;\n\tfma.rn.f32 \t%r278, %r160, %r204, %r265;\n\tfma.rn.f32 \t%r279, %r160, %r205, %r264;\n\tfma.rn.f32 \t%r280, %r161, %r207, %r279;\n\tfma.rn.f32 \t%r281, %r161, %r206, %r278;\n\tfma.rn.f32 \t%r282, %r151, %r207, %r277;\n\tfma.rn.f32 \t%r283, %r151, %r206, %r276;\n\tfma.rn.f32 \t%r284, %r141, %r207, %r275;\n\tfma.rn.f32 \t%r285, %r141, %r206, %r274;\n\tfma.rn.f32 \t%r286, %r131, %r207, %r273;\n\tfma.rn.f32 \t%r287, %r131, %r206, %r272;\n\tfma.rn.f32 \t%r288, %r128, %r208, %r287;\n\tfma.rn.f32 \t%r289, %r128, %r209, %r286;\n\tfma.rn.f32 \t%r290, %r138, %r208, %r285;\n\tfma.rn.f32 \t%r291, %r138, %r209, %r284;\n\tfma.rn.f32 \t%r292, %r148, %r208, %r283;\n\tfma.rn.f32 \t%r293, %r148, %r209, %r282;\n\tfma.rn.f32 \t%r294, %r158, %r208, %r281;\n\tfma.rn.f32 \t%r295, %r158, %r209, %r280;\n\tfma.rn.f32 \t%r296, %r159, %r211, %r295;\n\tfma.rn.f32 \t%r297, %r159, %r210, %r294;\n\tfma.rn.f32 \t%r298, %r149, %r211, %r293;\n\tfma.rn.f32 \t%r299, %r149, %r210, %r292;\n\tfma.rn.f32 \t%r300, %r139, %r211, %r291;\n\tfma.rn.f32 \t%r301, %r139, %r210, %r290;\n\tfma.rn.f32 \t%r302, %r129, %r211, %r289;\n\tfma.rn.f32 \t%r303, %r129, %r210, %r288;\n\tfma.rn.f32 \t%r304, %r134, %r212, %r303;\n\tfma.rn.f32 \t%r305, %r134, %r213, %r302;\n\tfma.rn.f32 \t%r306, %r144, %r212, %r301;\n\tfma.rn.f32 \t%r307, %r144, %r213, %r300;\n\tfma.rn.f32 \t%r308, %r154, %r212, %r299;\n\tfma.rn.f32 \t%r309, %r154, %r213, %r298;\n\tfma.rn.f32 \t%r310, %r164, %r212, %r297;\n\tfma.rn.f32 \t%r311, %r164, %r213, %r296;\n\tfma.rn.f32 \t%r312, %r165, %r215, %r311;\n\tfma.rn.f32 \t%r313, %r165, %r214, %r310;\n\tfma.rn.f32 \t%r314, %r155, %r215, %r309;\n\tfma.rn.f32 \t%r315, %r155, %r214, %r308;\n\tfma.rn.f32 \t%r316, %r145, %r215, %r307;\n\tfma.rn.f32 \t%r317, %r145, %r214, %r306;\n\tfma.rn.f32 \t%r318, %r135, %r215, %r305;\n\tfma.rn.f32 \t%r319, %r135, %r214, %r304;\n\tfma.rn.f32 \t%r320, %r132, %r216, %r319;\n\tfma.rn.f32 \t%r321, %r132, %r217, %r318;\n\tfma.rn.f32 \t%r322, %r142, %r216, %r317;\n\tfma.rn.f32 \t%r323, %r142, %r217, %r316;\n\tfma.rn.f32 \t%r324, %r152, %r216, %r315;\n\tfma.rn.f32 \t%r325, %r152, %r217, %r314;\n\tfma.rn.f32 \t%r326, %r162, %r216, %r313;\n\tfma.rn.f32 \t%r327, %r162, %r217, %r312;\n\tfma.rn.f32 \t%r328, %r163, %r219, %r327;\n\tfma.rn.f32 \t%r329, %r163, %r218, %r326;\n\tfma.rn.f32 \t%r330, %r153, %r219, %r325;\n\tfma.rn.f32 \t%r331, %r153, %r218, %r324;\n\tfma.rn.f32 \t%r332, %r143, %r219, %r323;\n\tfma.rn.f32 \t%r333, %r143, %r218, %r322;\n\tfma.rn.f32 \t%r334, %r133, %r219, %r321;\n\tfma.rn.f32 \t%r335, %r133, %r218, %r320;\n\tfma.rn.f32 \t%r336, %r136, %r220, %r335;\n\tfma.rn.f32 \t%r337, %r136, %r221, %r334;\n\tfma.rn.f32 \t%r338, %r146, %r220, %r333;\n\tfma.rn.f32 \t%r339, %r146, %r221, %r332;\n\tfma.rn.f32 \t%r340, %r156, %r220, %r331;\n\tfma.rn.f32 \t%r341, %r156, %r221, %r330;\n\tfma.rn.f32 \t%r342, %r166, %r220, %r329;\n\tfma.rn.f32 \t%r343, %r166, %r221, %r328;\n\tfma.rn.f32 \t%r392, %r167, %r223, %r343;\n\tfma.rn.f32 \t%r391, %r167, %r222, %r342;\n\tfma.rn.f32 \t%r390, %r157, %r223, %r341;\n\tfma.rn.f32 \t%r389, %r157, %r222, %r340;\n\tfma.rn.f32 \t%r388, %r147, %r223, %r339;\n\tfma.rn.f32 \t%r387, %r147, %r222, %r338;\n\tfma.rn.f32 \t%r386, %r137, %r223, %r337;\n\tfma.rn.f32 \t%r385, %r137, %r222, %r336;\n\t.loc\t1 68 18 // test_complex_kernels.py:68:18\n\tadd.s64 \t%rd20, %rd20, 32;\n\t.loc\t1 64 22 // test_complex_kernels.py:64:22\n\tadd.s64 \t%rd19, %rd19, %rd3;\n\tadd.s32 \t%r384, %r384, -1;\n\tadd.s32 \t%r383, %r383, -16;\n\tsetp.ne.s32 \t%p4, %r384, 0;\n\t@%p4 bra \t$L__BB0_3;\n// %bb.4: // %._crit_edge.loopexit\n\t.loc\t1 70 23 // test_complex_kernels.py:70:23\n\tcvt.rn.f16.f32 \t%rs17, %r385;\n\tcvt.rn.f16.f32 \t%rs18, %r386;\n\tmov.b32 \t%r394, {%rs17, %rs18};\n\tcvt.rn.f16.f32 \t%rs19, %r387;\n\tcvt.rn.f16.f32 \t%rs20, %r388;\n\tmov.b32 \t%r395, {%rs19, %rs20};\n\tcvt.rn.f16.f32 \t%rs21, %r389;\n\tcvt.rn.f16.f32 \t%rs22, %r390;\n\tmov.b32 \t%r396, {%rs21, %rs22};\n\tcvt.rn.f16.f32 \t%rs23, %r391;\n\tcvt.rn.f16.f32 \t%rs24, %r392;\n\tmov.b32 \t%r397, {%rs23, %rs24};\n\tbra.uni \t$L__BB0_5;\n$L__BB0_1: // %.._crit_edge_crit_edge\n\t.loc\t1 76 21 // test_complex_kernels.py:76:21\n\tshl.b32 \t%r393, %r1, 4;\n\tmov.b32 \t%r394, 0;\n\tmov.b32 \t%r395, %r394;\n\tmov.b32 \t%r396, %r394;\n\tmov.b32 \t%r397, %r394;\n$L__BB0_5: // %._crit_edge\n\t.loc\t1 74 33 // test_complex_kernels.py:74:33\n\tmul.lo.s32 \t%r348, %r5, %r47;\n\t.loc\t1 74 21 // test_complex_kernels.py:74:21\n\tmul.wide.s32 \t%rd16, %r348, 2;\n\tadd.s64 \t%rd17, %rd10, %rd16;\n\t.loc\t1 74 52 // test_complex_kernels.py:74:52\n\tmul.wide.s32 \t%rd18, %r6, 2;\n\tadd.s64 \t%rd15, %rd17, %rd18;\n\t.loc\t1 75 33 // test_complex_kernels.py:75:33\n\tsetp.lt.s32 \t%p6, %r5, %r42;\n\t.loc\t1 75 58 // test_complex_kernels.py:75:58\n\tsetp.lt.s32 \t%p7, %r6, %r43;\n\t.loc\t1 75 39 // test_complex_kernels.py:75:39\n\tand.pred \t%p5, %p6, %p7;\n\t.loc\t1 76 21 // test_complex_kernels.py:76:21\n\tbar.sync \t0;\n\tand.b32 \t%r349, %r393, 64;\n\tand.b32 \t%r350, %r1, 3;\n\tshl.b32 \t%r351, %r350, 2;\n\tor.b32 \t%r352, %r349, %r351;\n\tshl.b32 \t%r353, %r1, 1;\n\tand.b32 \t%r354, %r353, 48;\n\tor.b32 \t%r355, %r352, %r354;\n\tadd.s32 \t%r357, %r382, %r355;\n\tst.shared.b32 \t[%r357], %r394;\n\txor.b32 \t%r358, %r355, 132;\n\tadd.s32 \t%r359, %r382, %r358;\n\tst.shared.b32 \t[%r359], %r395;\n\txor.b32 \t%r360, %r355, 264;\n\tadd.s32 \t%r361, %r382, %r360;\n\tst.shared.b32 \t[%r361], %r396;\n\txor.b32 \t%r362, %r355, 396;\n\tadd.s32 \t%r363, %r382, %r362;\n\tst.shared.b32 \t[%r363], %r397;\n\tbar.sync \t0;\n\tshl.b32 \t%r364, %r350, 6;\n\tand.b32 \t%r365, %r1, 16;\n\tshl.b32 \t%r366, %r365, 4;\n\tor.b32 \t%r367, %r364, %r366;\n\tshl.b32 \t%r368, %r1, 2;\n\tand.b32 \t%r369, %r368, 48;\n\tor.b32 \t%r370, %r367, %r369;\n\tand.b32 \t%r371, %r353, 4;\n\tor.b32 \t%r372, %r370, %r371;\n\tshr.u32 \t%r373, %r365, 1;\n\tor.b32 \t%r374, %r372, %r373;\n\tadd.s32 \t%r375, %r382, %r374;\n\tld.shared.b32 \t%r344, [%r375];\n\txor.b32 \t%r376, %r374, 4;\n\tadd.s32 \t%r377, %r382, %r376;\n\tld.shared.b32 \t%r345, [%r377];\n\txor.b32 \t%r378, %r374, 8;\n\tadd.s32 \t%r379, %r382, %r378;\n\tld.shared.b32 \t%r346, [%r379];\n\txor.b32 \t%r380, %r374, 12;\n\tadd.s32 \t%r381, %r382, %r380;\n\tld.shared.b32 \t%r347, [%r381];\n\t// begin inline asm\n\t@%p5 st.global.v4.b32 [ %rd15 + 0 ], { %r344, %r345, %r346, %r347 };\n\t// end inline asm\n\t.loc\t1 76 4 // test_complex_kernels.py:76:4\n\tret;\n$L__tmp6:\n$L__func_end0:\n // -- End function\n}\n\t.file\t1 \"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\"\n\t.file\t2 \"/scratch/findhao/pta/triton/python/triton/language/standard.py\"\n\t.section\t.debug_abbrev\n\t{\n.b8 1 // Abbreviation Code\n.b8 17 // DW_TAG_compile_unit\n.b8 1 // DW_CHILDREN_yes\n.b8 37 // DW_AT_producer\n.b8 8 // DW_FORM_string\n.b8 19 // DW_AT_language\n.b8 5 // DW_FORM_data2\n.b8 3 // DW_AT_name\n.b8 8 // DW_FORM_string\n.b8 16 // DW_AT_stmt_list\n.b8 6 // DW_FORM_data4\n.b8 27 // DW_AT_comp_dir\n.b8 8 // DW_FORM_string\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 2 // Abbreviation Code\n.b8 46 // DW_TAG_subprogram\n.b8 0 // DW_CHILDREN_no\n.b8 3 // DW_AT_name\n.b8 8 // DW_FORM_string\n.b8 32 // DW_AT_inline\n.b8 11 // DW_FORM_data1\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 3 // Abbreviation Code\n.b8 46 // DW_TAG_subprogram\n.b8 1 // DW_CHILDREN_yes\n.b8 17 // DW_AT_low_pc\n.b8 1 // DW_FORM_addr\n.b8 18 // DW_AT_high_pc\n.b8 1 // DW_FORM_addr\n.b8 49 // DW_AT_abstract_origin\n.b8 19 // DW_FORM_ref4\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 4 // Abbreviation Code\n.b8 29 // DW_TAG_inlined_subroutine\n.b8 0 // DW_CHILDREN_no\n.b8 49 // DW_AT_abstract_origin\n.b8 19 // DW_FORM_ref4\n.b8 17 // DW_AT_low_pc\n.b8 1 // DW_FORM_addr\n.b8 18 // DW_AT_high_pc\n.b8 1 // DW_FORM_addr\n.b8 88 // DW_AT_call_file\n.b8 11 // DW_FORM_data1\n.b8 89 // DW_AT_call_line\n.b8 11 // DW_FORM_data1\n.b8 87 // DW_AT_call_column\n.b8 11 // DW_FORM_data1\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 0 // EOM(3)\n\t}\n\t.section\t.debug_info\n\t{\n.b32 191 // Length of Unit\n.b8 2 // DWARF version number\n.b8 0\n.b32 .debug_abbrev // Offset Into Abbrev. Section\n.b8 8 // Address Size (in bytes)\n.b8 1 // Abbrev [1] 0xb:0xb8 DW_TAG_compile_unit\n.b8 116 // DW_AT_producer\n.b8 114\n.b8 105\n.b8 116\n.b8 111\n.b8 110\n.b8 0\n.b8 2 // DW_AT_language\n.b8 0\n.b8 116 // DW_AT_name\n.b8 101\n.b8 115\n.b8 116\n.b8 95\n.b8 99\n.b8 111\n.b8 109\n.b8 112\n.b8 108\n.b8 101\n.b8 120\n.b8 95\n.b8 107\n.b8 101\n.b8 114\n.b8 110\n.b8 101\n.b8 108\n.b8 115\n.b8 46\n.b8 112\n.b8 121\n.b8 0\n.b32 .debug_line // DW_AT_stmt_list\n.b8 47 // DW_AT_comp_dir\n.b8 115\n.b8 99\n.b8 114\n.b8 97\n.b8 116\n.b8 99\n.b8 104\n.b8 47\n.b8 102\n.b8 105\n.b8 110\n.b8 100\n.b8 104\n.b8 97\n.b8 111\n.b8 47\n.b8 116\n.b8 114\n.b8 105\n.b8 116\n.b8 111\n.b8 110\n.b8 112\n.b8 97\n.b8 114\n.b8 115\n.b8 101\n.b8 47\n.b8 116\n.b8 101\n.b8 115\n.b8 116\n.b8 115\n.b8 0\n.b8 2 // Abbrev [2] 0x54:0x10 DW_TAG_subprogram\n.b8 109 // DW_AT_name\n.b8 97\n.b8 116\n.b8 109\n.b8 117\n.b8 108\n.b8 95\n.b8 107\n.b8 101\n.b8 114\n.b8 110\n.b8 101\n.b8 108\n.b8 0\n.b8 1 // DW_AT_inline\n.b8 3 // Abbrev [3] 0x64:0x5e DW_TAG_subprogram\n.b64 $L__func_begin0 // DW_AT_low_pc\n.b64 $L__func_end0 // DW_AT_high_pc\n.b32 84 // DW_AT_abstract_origin\n.b8 4 // Abbrev [4] 0x79:0x18 DW_TAG_inlined_subroutine\n.b32 84 // DW_AT_abstract_origin\n.b64 $L__tmp1 // DW_AT_low_pc\n.b64 $L__tmp2 // DW_AT_high_pc\n.b8 1 // DW_AT_call_file\n.b8 48 // DW_AT_call_line\n.b8 27 // DW_AT_call_column\n.b8 4 // Abbrev [4] 0x91:0x18 DW_TAG_inlined_subroutine\n.b32 84 // DW_AT_abstract_origin\n.b64 $L__tmp2 // DW_AT_low_pc\n.b64 $L__tmp3 // DW_AT_high_pc\n.b8 1 // DW_AT_call_file\n.b8 49 // DW_AT_call_line\n.b8 27 // DW_AT_call_column\n.b8 4 // Abbrev [4] 0xa9:0x18 DW_TAG_inlined_subroutine\n.b32 84 // DW_AT_abstract_origin\n.b64 $L__tmp4 // DW_AT_low_pc\n.b64 $L__tmp5 // DW_AT_high_pc\n.b8 1 // DW_AT_call_file\n.b8 64 // DW_AT_call_line\n.b8 33 // DW_AT_call_column\n.b8 0 // End Of Children Mark\n.b8 0 // End Of Children Mark\n\t}\n\t.section\t.debug_macinfo\t{\t}\n","matmul_kernel.json":"{\"hash\": \"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00\", \"target\": {\"backend\": \"cuda\", \"arch\": 75, \"warp_size\": 32}, \"num_warps\": 1, \"num_ctas\": 1, \"num_stages\": 1, \"warp_size\": 32, \"maxnreg\": null, \"cluster_dims\": [1, 1, 1], \"ptx_version\": null, \"ptx_options\": null, \"ir_override\": null, \"enable_fp_fusion\": true, \"launch_cooperative_grid\": false, \"launch_pdl\": false, \"supported_fp8_dtypes\": [\"fp8e4b15\", \"fp8e5\"], \"deprecated_fp8_dot_operand_dtypes\": [], \"default_dot_input_precision\": \"tf32\", \"allowed_dot_input_precisions\": [\"tf32\", \"tf32x3\", \"ieee\"], \"max_num_imprecise_acc_default\": 0, \"extern_libs\": [[\"libdevice\", \"/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc\"]], \"debug\": false, \"backend_name\": \"cuda\", \"sanitize_overflow\": true, \"arch\": \"sm75\", \"triton_version\": \"3.4.0\", \"tensordesc_meta\": [], \"shared\": 1024, \"tmem_size\": 0, \"global_scratch_size\": 0, \"global_scratch_align\": 1, \"name\": \"matmul_kernel\"}"},"python_source":{"file_path":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","start_line":29,"end_line":77,"code":"@triton.autotune(\n configs=[\n triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1}, num_stages=1, num_warps=1),\n triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1}, num_stages=1, num_warps=1),\n triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1}, num_stages=1, num_warps=1),\n ],\n key=['M', 'N', 'K'],\n)\n@triton.jit\ndef matmul_kernel(\n a_ptr, b_ptr, c_ptr,\n M, N, K,\n stride_am, stride_ak,\n stride_bk, stride_bn,\n stride_cm, stride_cn,\n BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,\n GROUP_SIZE_M: tl.constexpr,\n):\n pid = tl.program_id(axis=0)\n num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)\n num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)\n num_pid_in_group = GROUP_SIZE_M * num_pid_n\n group_id = pid // num_pid_in_group\n first_pid_m = group_id * GROUP_SIZE_M\n group_size = min(num_pid_m - first_pid_m, GROUP_SIZE_M)\n pid_m = first_pid_m + (pid % group_size)\n pid_n = (pid % num_pid_in_group) // group_size\n\n offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M\n offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N\n offs_k = tl.arange(0, BLOCK_SIZE_K)\n a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)\n b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)\n\n accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)\n for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):\n a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)\n b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)\n accumulator += tl.dot(a, b)\n a_ptrs += BLOCK_SIZE_K * stride_ak\n b_ptrs += BLOCK_SIZE_K * stride_bk\n c = accumulator.to(tl.float16)\n\n offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)\n offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)\n c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]\n c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)\n tl.store(c_ptrs, c, mask=c_mask)\n"},"times":{"ir_initialization":4433439,"lowering_stages":[],"store_results":0}}} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":null,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":149,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"compilation","pid":171439,"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":149,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":593,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel = self._do_compile(key, signature, device, constexprs, options, attrs, warmup)"},{"line":773,"name":"_do_compile","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel = self.compile(src, target=target, options=options.__dict__)"},{"line":267,"name":"compile","filename":"/scratch/findhao/pta/triton/python/triton/compiler/compiler.py","loc":"compilation_listener("},{"line":752,"name":"maybe_trace_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton("},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ","payload":{"metadata":{"cache_hit":true,"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32,"env":{},"src_attrs":{"(0,)":[["tt.divisibility",16]],"(1,)":[["tt.divisibility",16]],"(2,)":[["tt.divisibility",16]],"(3,)":[["tt.divisibility",16]],"(4,)":[["tt.divisibility",16]],"(5,)":[["tt.divisibility",16]],"(6,)":[["tt.divisibility",16]],"(8,)":[["tt.divisibility",16]],"(10,)":[["tt.divisibility",16]]},"src_cache_key":"5aec8bef23533ced7a4a2dea17fb314b1446b68a9ca72aa80e32caf75b768172","src_constants":{"(7,)":1,"(9,)":1,"(11,)":1,"(12,)":32,"(13,)":16,"(14,)":16,"(15,)":1}},"file_path":{"matmul_kernel.source":"/home/findhao/.triton/cache/4P4RCZ4YCG7ZMATVWAKTUA6IDOTMKLLE7DQYCJMN4KIH4IDUSCTQ/matmul_kernel.source","matmul_kernel.ttir":"/home/findhao/.triton/cache/4P4RCZ4YCG7ZMATVWAKTUA6IDOTMKLLE7DQYCJMN4KIH4IDUSCTQ/matmul_kernel.ttir","matmul_kernel.ttgir":"/home/findhao/.triton/cache/4P4RCZ4YCG7ZMATVWAKTUA6IDOTMKLLE7DQYCJMN4KIH4IDUSCTQ/matmul_kernel.ttgir","matmul_kernel.llir":"/home/findhao/.triton/cache/4P4RCZ4YCG7ZMATVWAKTUA6IDOTMKLLE7DQYCJMN4KIH4IDUSCTQ/matmul_kernel.llir","matmul_kernel.ptx":"/home/findhao/.triton/cache/4P4RCZ4YCG7ZMATVWAKTUA6IDOTMKLLE7DQYCJMN4KIH4IDUSCTQ/matmul_kernel.ptx","matmul_kernel.cubin":"/home/findhao/.triton/cache/4P4RCZ4YCG7ZMATVWAKTUA6IDOTMKLLE7DQYCJMN4KIH4IDUSCTQ/matmul_kernel.cubin","matmul_kernel.json":"/home/findhao/.triton/cache/4P4RCZ4YCG7ZMATVWAKTUA6IDOTMKLLE7DQYCJMN4KIH4IDUSCTQ/matmul_kernel.json"},"file_content":{"matmul_kernel.ttir":"#loc = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0)\nmodule {\n tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg3: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg4: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg5: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg6: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg7: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg8: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0)) attributes {noinline = false} {\n %c15_i32 = arith.constant 15 : i32 loc(#loc1)\n %c31_i32 = arith.constant 31 : i32 loc(#loc1)\n %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16> loc(#loc1)\n %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x16xf16> loc(#loc1)\n %c0_i32 = arith.constant 0 : i32 loc(#loc1)\n %cst_1 = arith.constant dense<16> : tensor<32x16xi32> loc(#loc1)\n %cst_2 = arith.constant dense<0.000000e+00> : tensor<32x16xf32> loc(#loc1)\n %c16_i32 = arith.constant 16 : i32 loc(#loc1)\n %c32_i32 = arith.constant 32 : i32 loc(#loc1)\n %c1_i32 = arith.constant 1 : i32 loc(#loc1)\n %0 = tt.get_program_id x : i32 loc(#loc2)\n %1 = arith.addi %arg3, %c31_i32 : i32 loc(#loc56)\n %2 = arith.divsi %1, %c32_i32 : i32 loc(#loc57)\n %3 = arith.addi %arg4, %c15_i32 : i32 loc(#loc58)\n %4 = arith.divsi %3, %c16_i32 : i32 loc(#loc59)\n %5 = arith.divsi %0, %4 : i32 loc(#loc7)\n %6 = arith.subi %2, %5 : i32 loc(#loc8)\n %7 = arith.minsi %6, %c1_i32 : i32 loc(#loc9)\n %8 = arith.remsi %0, %7 : i32 loc(#loc10)\n %9 = arith.addi %5, %8 : i32 loc(#loc11)\n %10 = arith.remsi %0, %4 : i32 loc(#loc12)\n %11 = arith.divsi %10, %7 : i32 loc(#loc13)\n %12 = arith.muli %9, %c32_i32 : i32 loc(#loc14)\n %13 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc15)\n %14 = tt.splat %12 : i32 -> tensor<32xi32> loc(#loc16)\n %15 = arith.addi %14, %13 : tensor<32xi32> loc(#loc16)\n %16 = tt.splat %arg3 : i32 -> tensor<32xi32> loc(#loc17)\n %17 = arith.remsi %15, %16 : tensor<32xi32> loc(#loc17)\n %18 = arith.muli %11, %c16_i32 : i32 loc(#loc18)\n %19 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc19)\n %20 = tt.splat %18 : i32 -> tensor<16xi32> loc(#loc20)\n %21 = arith.addi %20, %19 : tensor<16xi32> loc(#loc20)\n %22 = tt.splat %arg4 : i32 -> tensor<16xi32> loc(#loc21)\n %23 = arith.remsi %21, %22 : tensor<16xi32> loc(#loc21)\n %24 = tt.expand_dims %17 {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc22)\n %25 = tt.splat %arg6 : i32 -> tensor<32x1xi32> loc(#loc23)\n %26 = arith.muli %24, %25 : tensor<32x1xi32> loc(#loc23)\n %27 = tt.expand_dims %19 {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc24)\n %28 = tt.broadcast %26 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc25)\n %29 = tt.broadcast %27 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc25)\n %30 = arith.addi %28, %29 : tensor<32x16xi32> loc(#loc25)\n %31 = tt.splat %arg0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc26)\n %32 = tt.addptr %31, %30 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc26)\n %33 = tt.expand_dims %19 {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc27)\n %34 = tt.splat %arg7 : i32 -> tensor<16x1xi32> loc(#loc28)\n %35 = arith.muli %33, %34 : tensor<16x1xi32> loc(#loc28)\n %36 = tt.expand_dims %23 {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc29)\n %37 = tt.broadcast %35 : tensor<16x1xi32> -> tensor<16x16xi32> loc(#loc30)\n %38 = tt.broadcast %36 : tensor<1x16xi32> -> tensor<16x16xi32> loc(#loc30)\n %39 = arith.addi %37, %38 : tensor<16x16xi32> loc(#loc30)\n %40 = tt.splat %arg1 : !tt.ptr -> tensor<16x16x!tt.ptr> loc(#loc31)\n %41 = tt.addptr %40, %39 : tensor<16x16x!tt.ptr>, tensor<16x16xi32> loc(#loc31)\n %42 = arith.addi %arg5, %c15_i32 : i32 loc(#loc60)\n %43 = arith.divsi %42, %c16_i32 : i32 loc(#loc61)\n %44:3 = scf.for %arg9 = %c0_i32 to %43 step %c1_i32 iter_args(%arg10 = %32, %arg11 = %41, %arg12 = %cst_2) -> (tensor<32x16x!tt.ptr>, tensor<16x16x!tt.ptr>, tensor<32x16xf32>) : i32 {\n %62 = arith.muli %arg9, %c16_i32 : i32 loc(#loc34)\n %63 = arith.subi %arg5, %62 : i32 loc(#loc35)\n %64 = tt.splat %63 : i32 -> tensor<1x16xi32> loc(#loc36)\n %65 = arith.cmpi slt, %27, %64 : tensor<1x16xi32> loc(#loc36)\n %66 = tt.broadcast %65 : tensor<1x16xi1> -> tensor<32x16xi1> loc(#loc37)\n %67 = tt.load %arg10, %66, %cst_0 : tensor<32x16x!tt.ptr> loc(#loc37)\n %68 = tt.splat %63 : i32 -> tensor<16x1xi32> loc(#loc38)\n %69 = arith.cmpi slt, %33, %68 : tensor<16x1xi32> loc(#loc38)\n %70 = tt.broadcast %69 : tensor<16x1xi1> -> tensor<16x16xi1> loc(#loc39)\n %71 = tt.load %arg11, %70, %cst : tensor<16x16x!tt.ptr> loc(#loc39)\n %72 = tt.dot %67, %71, %arg12, inputPrecision = tf32 : tensor<32x16xf16> * tensor<16x16xf16> -> tensor<32x16xf32> loc(#loc40)\n %73 = tt.addptr %arg10, %cst_1 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc41)\n %74 = arith.muli %arg7, %c16_i32 : i32 loc(#loc42)\n %75 = tt.splat %74 : i32 -> tensor<16x16xi32> loc(#loc43)\n %76 = tt.addptr %arg11, %75 : tensor<16x16x!tt.ptr>, tensor<16x16xi32> loc(#loc43)\n scf.yield %73, %76, %72 : tensor<32x16x!tt.ptr>, tensor<16x16x!tt.ptr>, tensor<32x16xf32> loc(#loc44)\n } loc(#loc33)\n %45 = arith.truncf %44#2 : tensor<32x16xf32> to tensor<32x16xf16> loc(#loc45)\n %46 = tt.expand_dims %15 {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc46)\n %47 = tt.splat %arg8 : i32 -> tensor<32x1xi32> loc(#loc47)\n %48 = arith.muli %47, %46 : tensor<32x1xi32> loc(#loc47)\n %49 = tt.splat %arg2 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc48)\n %50 = tt.addptr %49, %48 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc48)\n %51 = tt.expand_dims %21 {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc49)\n %52 = tt.broadcast %50 : tensor<32x1x!tt.ptr> -> tensor<32x16x!tt.ptr> loc(#loc50)\n %53 = tt.broadcast %51 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc50)\n %54 = tt.addptr %52, %53 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc50)\n %55 = tt.splat %arg3 : i32 -> tensor<32x1xi32> loc(#loc51)\n %56 = arith.cmpi slt, %46, %55 : tensor<32x1xi32> loc(#loc51)\n %57 = tt.splat %arg4 : i32 -> tensor<1x16xi32> loc(#loc52)\n %58 = arith.cmpi slt, %51, %57 : tensor<1x16xi32> loc(#loc52)\n %59 = tt.broadcast %56 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc53)\n %60 = tt.broadcast %58 : tensor<1x16xi1> -> tensor<32x16xi1> loc(#loc53)\n %61 = arith.andi %59, %60 : tensor<32x16xi1> loc(#loc53)\n tt.store %54, %45, %61 : tensor<32x16x!tt.ptr> loc(#loc54)\n tt.return loc(#loc55)\n } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":47:24)\n#loc3 = loc(\"/scratch/findhao/pta/triton/python/triton/language/standard.py\":40:22)\n#loc4 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":48:27)\n#loc5 = loc(\"/scratch/findhao/pta/triton/python/triton/language/standard.py\":40:28)\n#loc6 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":49:27)\n#loc7 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":51:22)\n#loc8 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":53:33)\n#loc9 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":53:46)\n#loc10 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":54:33)\n#loc11 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":54:27)\n#loc12 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":55:19)\n#loc13 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":55:40)\n#loc14 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:23)\n#loc15 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:51)\n#loc16 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:38)\n#loc17 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:68)\n#loc18 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:23)\n#loc19 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:51)\n#loc20 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:38)\n#loc21 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:68)\n#loc22 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:30)\n#loc23 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:41)\n#loc24 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:60)\n#loc25 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:53)\n#loc26 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:22)\n#loc27 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:29)\n#loc28 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:40)\n#loc29 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:60)\n#loc30 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:52)\n#loc31 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:22)\n#loc32 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":64:33)\n#loc33 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":64:22)\n#loc34 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:59)\n#loc35 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:55)\n#loc36 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:51)\n#loc37 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:20)\n#loc38 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":66:51)\n#loc39 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":66:20)\n#loc40 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":67:33)\n#loc41 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":68:18)\n#loc42 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:33)\n#loc43 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:18)\n#loc44 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:8)\n#loc45 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":70:23)\n#loc46 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:41)\n#loc47 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:33)\n#loc48 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:21)\n#loc49 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:72)\n#loc50 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:52)\n#loc51 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:33)\n#loc52 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:58)\n#loc53 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:39)\n#loc54 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":76:21)\n#loc55 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":76:4)\n#loc56 = loc(callsite(#loc3 at #loc4))\n#loc57 = loc(callsite(#loc5 at #loc4))\n#loc58 = loc(callsite(#loc3 at #loc6))\n#loc59 = loc(callsite(#loc5 at #loc6))\n#loc60 = loc(callsite(#loc3 at #loc32))\n#loc61 = loc(callsite(#loc5 at #loc32))\n","matmul_kernel.ttgir":"#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [16, 2], warpsPerCTA = [1, 1], order = [1, 0]}>\n#blocked1 = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0]}>\n#loc = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0)\n#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>\n#smem = #ttg.shared_memory\nmodule attributes {\"ttg.num-ctas\" = 1 : i32, \"ttg.num-warps\" = 1 : i32, ttg.target = \"cuda:75\", \"ttg.threads-per-warp\" = 32 : i32} {\n tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg3: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg4: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg5: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg6: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg7: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg8: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0)) attributes {noinline = false} {\n %cst = arith.constant dense<16> : tensor<32x16xi32, #blocked> loc(#loc1)\n %c0_i32 = arith.constant 0 : i32 loc(#loc1)\n %c31_i32 = arith.constant 31 : i32 loc(#loc1)\n %c15_i32 = arith.constant 15 : i32 loc(#loc1)\n %c16_i32 = arith.constant 16 : i32 loc(#loc1)\n %c32_i32 = arith.constant 32 : i32 loc(#loc1)\n %c1_i32 = arith.constant 1 : i32 loc(#loc1)\n %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #blocked> loc(#loc1)\n %cst_1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #blocked> loc(#loc1)\n %cst_2 = arith.constant dense<0.000000e+00> : tensor<32x16xf32, #blocked1> loc(#loc1)\n %0 = tt.get_program_id x : i32 loc(#loc2)\n %1 = arith.addi %arg3, %c31_i32 : i32 loc(#loc56)\n %2 = arith.divsi %1, %c32_i32 : i32 loc(#loc57)\n %3 = arith.addi %arg4, %c15_i32 : i32 loc(#loc58)\n %4 = arith.divsi %3, %c16_i32 : i32 loc(#loc59)\n %5 = arith.divsi %0, %4 : i32 loc(#loc7)\n %6 = arith.subi %2, %5 : i32 loc(#loc8)\n %7 = arith.minsi %6, %c1_i32 : i32 loc(#loc9)\n %8 = arith.remsi %0, %7 : i32 loc(#loc10)\n %9 = arith.addi %5, %8 : i32 loc(#loc11)\n %10 = arith.remsi %0, %4 : i32 loc(#loc12)\n %11 = arith.divsi %10, %7 : i32 loc(#loc13)\n %12 = arith.muli %9, %c32_i32 : i32 loc(#loc14)\n %13 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc15)\n %14 = tt.splat %12 : i32 -> tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc16)\n %15 = arith.addi %14, %13 : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc16)\n %16 = tt.splat %arg3 : i32 -> tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc17)\n %17 = arith.remsi %15, %16 : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc17)\n %18 = arith.muli %11, %c16_i32 : i32 loc(#loc18)\n %19 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc19)\n %20 = tt.splat %18 : i32 -> tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc20)\n %21 = arith.addi %20, %19 : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc20)\n %22 = tt.splat %arg4 : i32 -> tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc21)\n %23 = arith.remsi %21, %22 : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc21)\n %24 = tt.expand_dims %17 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc22)\n %25 = tt.splat %arg6 : i32 -> tensor<32x1xi32, #blocked> loc(#loc23)\n %26 = arith.muli %24, %25 : tensor<32x1xi32, #blocked> loc(#loc23)\n %27 = tt.expand_dims %19 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc24)\n %28 = tt.broadcast %26 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc25)\n %29 = tt.broadcast %27 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc25)\n %30 = arith.addi %28, %29 : tensor<32x16xi32, #blocked> loc(#loc25)\n %31 = tt.splat %arg0 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc26)\n %32 = tt.addptr %31, %30 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc26)\n %33 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc27)\n %34 = tt.expand_dims %33 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> loc(#loc27)\n %35 = tt.splat %arg7 : i32 -> tensor<16x1xi32, #blocked> loc(#loc28)\n %36 = arith.muli %34, %35 : tensor<16x1xi32, #blocked> loc(#loc28)\n %37 = tt.expand_dims %23 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc29)\n %38 = tt.broadcast %36 : tensor<16x1xi32, #blocked> -> tensor<16x16xi32, #blocked> loc(#loc30)\n %39 = tt.broadcast %37 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked> loc(#loc30)\n %40 = arith.addi %38, %39 : tensor<16x16xi32, #blocked> loc(#loc30)\n %41 = tt.splat %arg1 : !tt.ptr -> tensor<16x16x!tt.ptr, #blocked> loc(#loc31)\n %42 = tt.addptr %41, %40 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> loc(#loc31)\n %43 = arith.addi %arg5, %c15_i32 : i32 loc(#loc60)\n %44 = arith.divsi %43, %c16_i32 : i32 loc(#loc61)\n %45 = arith.muli %arg7, %c16_i32 : i32 loc(#loc33)\n %46 = tt.splat %45 : i32 -> tensor<16x16xi32, #blocked> loc(#loc34)\n %47:3 = scf.for %arg9 = %c0_i32 to %44 step %c1_i32 iter_args(%arg10 = %cst_2, %arg11 = %32, %arg12 = %42) -> (tensor<32x16xf32, #blocked1>, tensor<32x16x!tt.ptr, #blocked>, tensor<16x16x!tt.ptr, #blocked>) : i32 {\n %66 = arith.muli %arg9, %c16_i32 : i32 loc(#loc36)\n %67 = arith.subi %arg5, %66 : i32 loc(#loc37)\n %68 = tt.splat %67 : i32 -> tensor<1x16xi32, #blocked> loc(#loc38)\n %69 = arith.cmpi slt, %27, %68 : tensor<1x16xi32, #blocked> loc(#loc38)\n %70 = tt.broadcast %69 : tensor<1x16xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc39)\n %71 = tt.load %arg11, %70, %cst_0 : tensor<32x16x!tt.ptr, #blocked> loc(#loc39)\n %72 = tt.splat %67 : i32 -> tensor<16x1xi32, #blocked> loc(#loc40)\n %73 = arith.cmpi slt, %34, %72 : tensor<16x1xi32, #blocked> loc(#loc40)\n %74 = tt.broadcast %73 : tensor<16x1xi1, #blocked> -> tensor<16x16xi1, #blocked> loc(#loc41)\n %75 = tt.load %arg12, %74, %cst_1 : tensor<16x16x!tt.ptr, #blocked> loc(#loc41)\n %76 = arith.extf %71 : tensor<32x16xf16, #blocked> to tensor<32x16xf32, #blocked> loc(#loc42)\n %77 = ttg.local_alloc %76 : (tensor<32x16xf32, #blocked>) -> !ttg.memdesc<32x16xf32, #shared, #smem> loc(#loc42)\n %78 = ttg.local_load %77 : !ttg.memdesc<32x16xf32, #shared, #smem> -> tensor<32x16xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked1}>> loc(#loc42)\n %79 = arith.extf %75 : tensor<16x16xf16, #blocked> to tensor<16x16xf32, #blocked> loc(#loc42)\n %80 = ttg.local_alloc %79 : (tensor<16x16xf32, #blocked>) -> !ttg.memdesc<16x16xf32, #shared, #smem> loc(#loc42)\n %81 = ttg.local_load %80 : !ttg.memdesc<16x16xf32, #shared, #smem> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked1}>> loc(#loc42)\n %82 = tt.dot %78, %81, %arg10, inputPrecision = tf32 : tensor<32x16xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked1}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked1}>> -> tensor<32x16xf32, #blocked1> loc(#loc42)\n %83 = tt.addptr %arg11, %cst : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc43)\n %84 = tt.addptr %arg12, %46 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> loc(#loc34)\n scf.yield %82, %83, %84 : tensor<32x16xf32, #blocked1>, tensor<32x16x!tt.ptr, #blocked>, tensor<16x16x!tt.ptr, #blocked> loc(#loc44)\n } loc(#loc35)\n %48 = arith.truncf %47#0 : tensor<32x16xf32, #blocked1> to tensor<32x16xf16, #blocked1> loc(#loc45)\n %49 = tt.expand_dims %15 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc46)\n %50 = tt.splat %arg8 : i32 -> tensor<32x1xi32, #blocked> loc(#loc47)\n %51 = arith.muli %50, %49 : tensor<32x1xi32, #blocked> loc(#loc47)\n %52 = tt.splat %arg2 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> loc(#loc48)\n %53 = tt.addptr %52, %51 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> loc(#loc48)\n %54 = tt.expand_dims %21 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc49)\n %55 = tt.broadcast %53 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x16x!tt.ptr, #blocked> loc(#loc50)\n %56 = tt.broadcast %54 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc50)\n %57 = tt.addptr %55, %56 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc50)\n %58 = tt.splat %arg3 : i32 -> tensor<32x1xi32, #blocked> loc(#loc51)\n %59 = arith.cmpi slt, %49, %58 : tensor<32x1xi32, #blocked> loc(#loc51)\n %60 = tt.splat %arg4 : i32 -> tensor<1x16xi32, #blocked> loc(#loc52)\n %61 = arith.cmpi slt, %54, %60 : tensor<1x16xi32, #blocked> loc(#loc52)\n %62 = tt.broadcast %59 : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc53)\n %63 = tt.broadcast %61 : tensor<1x16xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc53)\n %64 = arith.andi %62, %63 : tensor<32x16xi1, #blocked> loc(#loc53)\n %65 = ttg.convert_layout %48 : tensor<32x16xf16, #blocked1> -> tensor<32x16xf16, #blocked> loc(#loc54)\n tt.store %57, %65, %64 : tensor<32x16x!tt.ptr, #blocked> loc(#loc54)\n tt.return loc(#loc55)\n } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":47:24)\n#loc3 = loc(\"/scratch/findhao/pta/triton/python/triton/language/standard.py\":40:22)\n#loc4 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":48:27)\n#loc5 = loc(\"/scratch/findhao/pta/triton/python/triton/language/standard.py\":40:28)\n#loc6 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":49:27)\n#loc7 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":51:22)\n#loc8 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":53:33)\n#loc9 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":53:46)\n#loc10 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":54:33)\n#loc11 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":54:27)\n#loc12 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":55:19)\n#loc13 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":55:40)\n#loc14 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:23)\n#loc15 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:51)\n#loc16 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:38)\n#loc17 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:68)\n#loc18 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:23)\n#loc19 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:51)\n#loc20 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:38)\n#loc21 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:68)\n#loc22 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:30)\n#loc23 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:41)\n#loc24 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:60)\n#loc25 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:53)\n#loc26 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:22)\n#loc27 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:29)\n#loc28 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:40)\n#loc29 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:60)\n#loc30 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:52)\n#loc31 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:22)\n#loc32 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":64:33)\n#loc33 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:33)\n#loc34 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:18)\n#loc35 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":64:22)\n#loc36 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:59)\n#loc37 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:55)\n#loc38 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:51)\n#loc39 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:20)\n#loc40 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":66:51)\n#loc41 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":66:20)\n#loc42 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":67:33)\n#loc43 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":68:18)\n#loc44 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:8)\n#loc45 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":70:23)\n#loc46 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:41)\n#loc47 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:33)\n#loc48 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:21)\n#loc49 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:72)\n#loc50 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:52)\n#loc51 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:33)\n#loc52 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:58)\n#loc53 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:39)\n#loc54 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":76:21)\n#loc55 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":76:4)\n#loc56 = loc(callsite(#loc3 at #loc4))\n#loc57 = loc(callsite(#loc5 at #loc4))\n#loc58 = loc(callsite(#loc3 at #loc6))\n#loc59 = loc(callsite(#loc5 at #loc6))\n#loc60 = loc(callsite(#loc3 at #loc32))\n#loc61 = loc(callsite(#loc5 at #loc32))\n","matmul_kernel.llir":"; ModuleID = 'LLVMDialectModule'\nsource_filename = \"LLVMDialectModule\"\ntarget datalayout = \"e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64\"\n\n@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16\n\ndefine ptx_kernel void @matmul_kernel(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9) local_unnamed_addr #0 !dbg !5 {\n %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8\n %12 = add i32 %3, 31, !dbg !9\n %13 = sdiv i32 %12, 32, !dbg !13\n %14 = add i32 %4, 15, !dbg !14\n %15 = sdiv i32 %14, 16, !dbg !16\n %.frozen = freeze i32 %15, !dbg !17\n %16 = sdiv i32 %11, %.frozen, !dbg !17\n %17 = sub i32 %13, %16, !dbg !18\n %18 = tail call i32 @llvm.smin.i32(i32 %17, i32 1), !dbg !19\n %19 = srem i32 %11, %18, !dbg !20\n %20 = add i32 %19, %16, !dbg !21\n %21 = mul i32 %16, %.frozen, !dbg !22\n %.decomposed = sub i32 %11, %21, !dbg !22\n %22 = sdiv i32 %.decomposed, %18, !dbg !23\n %23 = shl i32 %20, 5, !dbg !24\n %24 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !25\n %25 = lshr i32 %24, 1, !dbg !25\n %26 = and i32 %25, 15, !dbg !25\n %27 = or disjoint i32 %23, %26, !dbg !26\n %28 = or disjoint i32 %27, 16, !dbg !26\n %29 = shl nsw i32 %22, 4, !dbg !27\n %30 = and i32 %24, 1, !dbg !28\n %31 = shl nuw nsw i32 %30, 3, !dbg !28\n %32 = or disjoint i32 %29, %31, !dbg !29\n %33 = add i32 %5, 15, !dbg !30\n %34 = sdiv i32 %33, 16, !dbg !32\n %35 = icmp sgt i32 %33, 15, !dbg !33\n br i1 %35, label %.lr.ph, label %._crit_edge, !dbg !33\n\n.lr.ph: ; preds = %10\n %36 = shl i32 %7, 4, !dbg !34\n %37 = srem i32 %32, %4, !dbg !35\n %38 = mul i32 %7, %26, !dbg !36\n %39 = add i32 %37, %38, !dbg !37\n %40 = sext i32 %39 to i64, !dbg !38\n %41 = getelementptr half, ptr addrspace(1) %1, i64 %40, !dbg !38\n %42 = srem i32 %28, %3, !dbg !39\n %43 = mul i32 %42, %6, !dbg !40\n %44 = add i32 %43, %31, !dbg !41\n %45 = sext i32 %44 to i64, !dbg !42\n %46 = getelementptr half, ptr addrspace(1) %0, i64 %45, !dbg !42\n %47 = srem i32 %27, %3, !dbg !39\n %48 = mul i32 %47, %6, !dbg !40\n %49 = add i32 %48, %31, !dbg !41\n %50 = sext i32 %49 to i64, !dbg !42\n %51 = getelementptr half, ptr addrspace(1) %0, i64 %50, !dbg !42\n %52 = shl nuw nsw i32 %24, 5\n %53 = and i32 %52, 992\n %54 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %53\n %55 = getelementptr inbounds nuw i8, ptr addrspace(3) %54, i32 16\n %56 = getelementptr inbounds nuw i8, ptr addrspace(3) %54, i32 1024\n %57 = getelementptr inbounds nuw i8, ptr addrspace(3) %54, i32 1040\n %58 = shl nuw nsw i32 %24, 6\n %59 = and i32 %58, 1792\n %60 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %59\n %61 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 16\n %62 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 64\n %63 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 80\n %64 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 128\n %65 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 144\n %66 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 192\n %67 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 208\n %68 = shl nuw nsw i32 %24, 4\n %69 = and i32 %68, 48\n %70 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %69\n %71 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 64\n %72 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 128\n %73 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 192\n %74 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 256\n %75 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 320\n %76 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 384\n %77 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 448\n %78 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 512\n %79 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 576\n %80 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 640\n %81 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 704\n %82 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 768\n %83 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 832\n %84 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 896\n %85 = getelementptr inbounds nuw i8, ptr addrspace(3) %70, i32 960\n %86 = sext i32 %36 to i64\n %87 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 24\n %88 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 40\n %89 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 56\n %90 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 88\n %91 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 104\n %92 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 120\n %93 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 152\n %94 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 168\n %95 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 184\n %96 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 216\n %97 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 232\n %98 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 248\n br label %99, !dbg !33\n\n99: ; preds = %.lr.ph, %99\n %.pn47206 = phi ptr addrspace(1) [ %41, %.lr.ph ], [ %275, %99 ]\n %.pn15205 = phi ptr addrspace(1) [ %46, %.lr.ph ], [ %274, %99 ]\n %.pn31204 = phi ptr addrspace(1) [ %51, %.lr.ph ], [ %273, %99 ]\n %100 = phi i32 [ 0, %.lr.ph ], [ %276, %99 ]\n %101 = phi <16 x float> [ zeroinitializer, %.lr.ph ], [ %272, %99 ]\n %102 = shufflevector <16 x float> %101, <16 x float> poison, <16 x i32> \n %103 = shl i32 %100, 4, !dbg !43\n %104 = sub i32 %5, %103, !dbg !44\n %105 = icmp slt i32 %31, %104, !dbg !45\n %106 = tail call { i32, i32, i32, i32 } asm sideeffect \"mov.u32 $0, $4;\\0A\\09mov.u32 $1, $5;\\0A\\09mov.u32 $2, $6;\\0A\\09mov.u32 $3, $7;\\0A\\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];\", \"=r,=r,=r,=r,r,r,r,r,l,b\"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %.pn31204, i1 %105) #4, !dbg !46\n %107 = extractvalue { i32, i32, i32, i32 } %106, 0, !dbg !46\n %108 = bitcast i32 %107 to <2 x half>, !dbg !46\n %109 = extractvalue { i32, i32, i32, i32 } %106, 1, !dbg !46\n %110 = bitcast i32 %109 to <2 x half>, !dbg !46\n %111 = extractvalue { i32, i32, i32, i32 } %106, 2, !dbg !46\n %112 = bitcast i32 %111 to <2 x half>, !dbg !46\n %113 = extractvalue { i32, i32, i32, i32 } %106, 3, !dbg !46\n %114 = bitcast i32 %113 to <2 x half>, !dbg !46\n %115 = tail call { i32, i32, i32, i32 } asm sideeffect \"mov.u32 $0, $4;\\0A\\09mov.u32 $1, $5;\\0A\\09mov.u32 $2, $6;\\0A\\09mov.u32 $3, $7;\\0A\\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];\", \"=r,=r,=r,=r,r,r,r,r,l,b\"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %.pn15205, i1 %105) #4, !dbg !46\n %116 = extractvalue { i32, i32, i32, i32 } %115, 0, !dbg !46\n %117 = bitcast i32 %116 to <2 x half>, !dbg !46\n %118 = extractvalue { i32, i32, i32, i32 } %115, 1, !dbg !46\n %119 = bitcast i32 %118 to <2 x half>, !dbg !46\n %120 = extractvalue { i32, i32, i32, i32 } %115, 2, !dbg !46\n %121 = bitcast i32 %120 to <2 x half>, !dbg !46\n %122 = extractvalue { i32, i32, i32, i32 } %115, 3, !dbg !46\n %123 = bitcast i32 %122 to <2 x half>, !dbg !46\n %124 = icmp slt i32 %26, %104, !dbg !47\n %125 = tail call { i32, i32, i32, i32 } asm sideeffect \"mov.u32 $0, $4;\\0A\\09mov.u32 $1, $5;\\0A\\09mov.u32 $2, $6;\\0A\\09mov.u32 $3, $7;\\0A\\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];\", \"=r,=r,=r,=r,r,r,r,r,l,b\"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %.pn47206, i1 %124) #4, !dbg !48\n %126 = extractvalue { i32, i32, i32, i32 } %125, 0, !dbg !48\n %127 = bitcast i32 %126 to <2 x half>, !dbg !48\n %128 = extractvalue { i32, i32, i32, i32 } %125, 1, !dbg !48\n %129 = bitcast i32 %128 to <2 x half>, !dbg !48\n %130 = extractvalue { i32, i32, i32, i32 } %125, 2, !dbg !48\n %131 = bitcast i32 %130 to <2 x half>, !dbg !48\n %132 = extractvalue { i32, i32, i32, i32 } %125, 3, !dbg !48\n %133 = bitcast i32 %132 to <2 x half>, !dbg !48\n %134 = shufflevector <2 x half> %108, <2 x half> %110, <4 x i32> , !dbg !49\n %135 = fpext <4 x half> %134 to <4 x float>, !dbg !49\n %136 = shufflevector <2 x half> %112, <2 x half> %114, <4 x i32> , !dbg !49\n %137 = fpext <4 x half> %136 to <4 x float>, !dbg !49\n %138 = shufflevector <2 x half> %117, <2 x half> %119, <4 x i32> , !dbg !49\n %139 = fpext <4 x half> %138 to <4 x float>, !dbg !49\n %140 = shufflevector <2 x half> %121, <2 x half> %123, <4 x i32> , !dbg !49\n %141 = fpext <4 x half> %140 to <4 x float>, !dbg !49\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49\n store <4 x float> %135, ptr addrspace(3) %54, align 16, !dbg !49\n store <4 x float> %137, ptr addrspace(3) %55, align 16, !dbg !49\n store <4 x float> %139, ptr addrspace(3) %56, align 16, !dbg !49\n store <4 x float> %141, ptr addrspace(3) %57, align 16, !dbg !49\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49\n %142 = shufflevector <2 x half> %127, <2 x half> %129, <4 x i32> , !dbg !49\n %143 = fpext <4 x half> %142 to <4 x float>, !dbg !49\n %144 = shufflevector <2 x half> %131, <2 x half> %133, <4 x i32> , !dbg !49\n %145 = fpext <4 x half> %144 to <4 x float>, !dbg !49\n %146 = load <4 x float>, ptr addrspace(3) %87, align 8, !dbg !49\n %147 = load <4 x float>, ptr addrspace(3) %88, align 8, !dbg !49\n %148 = load <2 x float>, ptr addrspace(3) %89, align 8, !dbg !49\n %149 = load <4 x float>, ptr addrspace(3) %90, align 8, !dbg !49\n %150 = load <4 x float>, ptr addrspace(3) %91, align 8, !dbg !49\n %151 = load <2 x float>, ptr addrspace(3) %92, align 8, !dbg !49\n %152 = load <4 x float>, ptr addrspace(3) %93, align 8, !dbg !49\n %153 = load <4 x float>, ptr addrspace(3) %94, align 8, !dbg !49\n %154 = load <2 x float>, ptr addrspace(3) %95, align 8, !dbg !49\n %155 = load <4 x float>, ptr addrspace(3) %96, align 8, !dbg !49\n %156 = load <4 x float>, ptr addrspace(3) %97, align 8, !dbg !49\n %157 = load <2 x float>, ptr addrspace(3) %98, align 8, !dbg !49\n %158 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !49\n %159 = load <2 x float>, ptr addrspace(3) %61, align 16, !dbg !49\n %160 = load <4 x float>, ptr addrspace(3) %62, align 16, !dbg !49\n %161 = load <2 x float>, ptr addrspace(3) %63, align 16, !dbg !49\n %162 = load <4 x float>, ptr addrspace(3) %64, align 16, !dbg !49\n %163 = load <2 x float>, ptr addrspace(3) %65, align 16, !dbg !49\n %164 = load <4 x float>, ptr addrspace(3) %66, align 16, !dbg !49\n %165 = load <2 x float>, ptr addrspace(3) %67, align 16, !dbg !49\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49\n store <4 x float> %143, ptr addrspace(3) %54, align 16, !dbg !49\n store <4 x float> %145, ptr addrspace(3) %55, align 16, !dbg !49\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49\n %166 = load <4 x float>, ptr addrspace(3) %70, align 16, !dbg !49\n %167 = shufflevector <4 x float> %166, <4 x float> poison, <16 x i32> , !dbg !49\n %168 = load <4 x float>, ptr addrspace(3) %71, align 16, !dbg !49\n %169 = shufflevector <4 x float> %168, <4 x float> poison, <16 x i32> , !dbg !49\n %170 = load <4 x float>, ptr addrspace(3) %72, align 16, !dbg !49\n %171 = shufflevector <4 x float> %170, <4 x float> poison, <16 x i32> , !dbg !49\n %172 = load <4 x float>, ptr addrspace(3) %73, align 16, !dbg !49\n %173 = shufflevector <4 x float> %172, <4 x float> poison, <16 x i32> , !dbg !49\n %174 = load <4 x float>, ptr addrspace(3) %74, align 16, !dbg !49\n %175 = shufflevector <4 x float> %174, <4 x float> poison, <16 x i32> , !dbg !49\n %176 = load <4 x float>, ptr addrspace(3) %75, align 16, !dbg !49\n %177 = shufflevector <4 x float> %176, <4 x float> poison, <16 x i32> , !dbg !49\n %178 = load <4 x float>, ptr addrspace(3) %76, align 16, !dbg !49\n %179 = shufflevector <4 x float> %178, <4 x float> poison, <16 x i32> , !dbg !49\n %180 = load <4 x float>, ptr addrspace(3) %77, align 16, !dbg !49\n %181 = shufflevector <4 x float> %180, <4 x float> poison, <16 x i32> , !dbg !49\n %182 = load <4 x float>, ptr addrspace(3) %78, align 16, !dbg !49\n %183 = shufflevector <4 x float> %182, <4 x float> poison, <16 x i32> , !dbg !49\n %184 = load <4 x float>, ptr addrspace(3) %79, align 16, !dbg !49\n %185 = shufflevector <4 x float> %184, <4 x float> poison, <16 x i32> , !dbg !49\n %186 = load <4 x float>, ptr addrspace(3) %80, align 16, !dbg !49\n %187 = shufflevector <4 x float> %186, <4 x float> poison, <16 x i32> , !dbg !49\n %188 = load <4 x float>, ptr addrspace(3) %81, align 16, !dbg !49\n %189 = shufflevector <4 x float> %188, <4 x float> poison, <16 x i32> , !dbg !49\n %190 = load <4 x float>, ptr addrspace(3) %82, align 16, !dbg !49\n %191 = shufflevector <4 x float> %190, <4 x float> poison, <16 x i32> , !dbg !49\n %192 = load <4 x float>, ptr addrspace(3) %83, align 16, !dbg !49\n %193 = shufflevector <4 x float> %192, <4 x float> poison, <16 x i32> , !dbg !49\n %194 = load <4 x float>, ptr addrspace(3) %84, align 16, !dbg !49\n %195 = shufflevector <4 x float> %194, <4 x float> poison, <16 x i32> , !dbg !49\n %196 = load <4 x float>, ptr addrspace(3) %85, align 16, !dbg !49\n %197 = shufflevector <4 x float> %196, <4 x float> poison, <16 x i32> , !dbg !49\n %198 = shufflevector <4 x float> %158, <4 x float> %160, <16 x i32> , !dbg !49\n %199 = shufflevector <4 x float> %162, <4 x float> poison, <16 x i32> , !dbg !49\n %200 = shufflevector <16 x float> %198, <16 x float> %199, <16 x i32> , !dbg !49\n %201 = shufflevector <4 x float> %164, <4 x float> poison, <16 x i32> , !dbg !49\n %202 = shufflevector <16 x float> %200, <16 x float> %201, <16 x i32> , !dbg !49\n %203 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %202, <16 x float> %167, <16 x float> %102), !dbg !49\n %204 = shufflevector <4 x float> %158, <4 x float> %160, <16 x i32> , !dbg !49\n %205 = shufflevector <16 x float> %204, <16 x float> %199, <16 x i32> , !dbg !49\n %206 = shufflevector <16 x float> %205, <16 x float> %201, <16 x i32> , !dbg !49\n %207 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %206, <16 x float> %169, <16 x float> %203), !dbg !49\n %208 = shufflevector <4 x float> %158, <4 x float> %160, <16 x i32> , !dbg !49\n %209 = shufflevector <16 x float> %208, <16 x float> %199, <16 x i32> , !dbg !49\n %210 = shufflevector <16 x float> %209, <16 x float> %201, <16 x i32> , !dbg !49\n %211 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %210, <16 x float> %171, <16 x float> %207), !dbg !49\n %212 = shufflevector <4 x float> %158, <4 x float> %160, <16 x i32> , !dbg !49\n %213 = shufflevector <16 x float> %212, <16 x float> %199, <16 x i32> , !dbg !49\n %214 = shufflevector <16 x float> %213, <16 x float> %201, <16 x i32> , !dbg !49\n %215 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %214, <16 x float> %173, <16 x float> %211), !dbg !49\n %216 = shufflevector <2 x float> %159, <2 x float> %161, <16 x i32> , !dbg !49\n %217 = shufflevector <2 x float> %163, <2 x float> poison, <16 x i32> , !dbg !49\n %218 = shufflevector <16 x float> %216, <16 x float> %217, <16 x i32> , !dbg !49\n %219 = shufflevector <2 x float> %165, <2 x float> poison, <16 x i32> , !dbg !49\n %220 = shufflevector <16 x float> %218, <16 x float> %219, <16 x i32> , !dbg !49\n %221 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %220, <16 x float> %175, <16 x float> %215), !dbg !49\n %222 = shufflevector <2 x float> %159, <2 x float> %161, <16 x i32> , !dbg !49\n %223 = shufflevector <16 x float> %222, <16 x float> %217, <16 x i32> , !dbg !49\n %224 = shufflevector <16 x float> %223, <16 x float> %219, <16 x i32> , !dbg !49\n %225 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %224, <16 x float> %177, <16 x float> %221), !dbg !49\n %226 = shufflevector <4 x float> %146, <4 x float> %149, <16 x i32> , !dbg !49\n %227 = shufflevector <4 x float> %152, <4 x float> poison, <16 x i32> , !dbg !49\n %228 = shufflevector <16 x float> %226, <16 x float> %227, <16 x i32> , !dbg !49\n %229 = shufflevector <4 x float> %155, <4 x float> poison, <16 x i32> , !dbg !49\n %230 = shufflevector <16 x float> %228, <16 x float> %229, <16 x i32> , !dbg !49\n %231 = shufflevector <16 x float> %225, <16 x float> poison, <16 x i32> , !dbg !49\n %232 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %230, <16 x float> %179, <16 x float> %231), !dbg !49\n %233 = shufflevector <4 x float> %146, <4 x float> %149, <16 x i32> , !dbg !49\n %234 = shufflevector <16 x float> %233, <16 x float> %227, <16 x i32> , !dbg !49\n %235 = shufflevector <16 x float> %234, <16 x float> %229, <16 x i32> , !dbg !49\n %236 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %235, <16 x float> %181, <16 x float> %232), !dbg !49\n %237 = shufflevector <4 x float> %146, <4 x float> %149, <16 x i32> , !dbg !49\n %238 = shufflevector <16 x float> %237, <16 x float> %227, <16 x i32> , !dbg !49\n %239 = shufflevector <16 x float> %238, <16 x float> %229, <16 x i32> , !dbg !49\n %240 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %239, <16 x float> %183, <16 x float> %236), !dbg !49\n %241 = shufflevector <4 x float> %146, <4 x float> %149, <16 x i32> , !dbg !49\n %242 = shufflevector <16 x float> %241, <16 x float> %227, <16 x i32> , !dbg !49\n %243 = shufflevector <16 x float> %242, <16 x float> %229, <16 x i32> , !dbg !49\n %244 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %243, <16 x float> %185, <16 x float> %240), !dbg !49\n %245 = shufflevector <4 x float> %147, <4 x float> %150, <16 x i32> , !dbg !49\n %246 = shufflevector <4 x float> %153, <4 x float> poison, <16 x i32> , !dbg !49\n %247 = shufflevector <16 x float> %245, <16 x float> %246, <16 x i32> , !dbg !49\n %248 = shufflevector <4 x float> %156, <4 x float> poison, <16 x i32> , !dbg !49\n %249 = shufflevector <16 x float> %247, <16 x float> %248, <16 x i32> , !dbg !49\n %250 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %249, <16 x float> %187, <16 x float> %244), !dbg !49\n %251 = shufflevector <4 x float> %147, <4 x float> %150, <16 x i32> , !dbg !49\n %252 = shufflevector <16 x float> %251, <16 x float> %246, <16 x i32> , !dbg !49\n %253 = shufflevector <16 x float> %252, <16 x float> %248, <16 x i32> , !dbg !49\n %254 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %253, <16 x float> %189, <16 x float> %250), !dbg !49\n %255 = shufflevector <4 x float> %147, <4 x float> %150, <16 x i32> , !dbg !49\n %256 = shufflevector <16 x float> %255, <16 x float> %246, <16 x i32> , !dbg !49\n %257 = shufflevector <16 x float> %256, <16 x float> %248, <16 x i32> , !dbg !49\n %258 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %257, <16 x float> %191, <16 x float> %254), !dbg !49\n %259 = shufflevector <4 x float> %147, <4 x float> %150, <16 x i32> , !dbg !49\n %260 = shufflevector <16 x float> %259, <16 x float> %246, <16 x i32> , !dbg !49\n %261 = shufflevector <16 x float> %260, <16 x float> %248, <16 x i32> , !dbg !49\n %262 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %261, <16 x float> %193, <16 x float> %258), !dbg !49\n %263 = shufflevector <2 x float> %148, <2 x float> %151, <16 x i32> , !dbg !49\n %264 = shufflevector <2 x float> %154, <2 x float> poison, <16 x i32> , !dbg !49\n %265 = shufflevector <16 x float> %263, <16 x float> %264, <16 x i32> , !dbg !49\n %266 = shufflevector <2 x float> %157, <2 x float> poison, <16 x i32> , !dbg !49\n %267 = shufflevector <16 x float> %265, <16 x float> %266, <16 x i32> , !dbg !49\n %268 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %267, <16 x float> %195, <16 x float> %262), !dbg !49\n %269 = shufflevector <2 x float> %148, <2 x float> %151, <16 x i32> , !dbg !49\n %270 = shufflevector <16 x float> %269, <16 x float> %264, <16 x i32> , !dbg !49\n %271 = shufflevector <16 x float> %270, <16 x float> %266, <16 x i32> , !dbg !49\n %272 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %271, <16 x float> %197, <16 x float> %268), !dbg !49\n %273 = getelementptr i8, ptr addrspace(1) %.pn31204, i64 32, !dbg !50\n %274 = getelementptr i8, ptr addrspace(1) %.pn15205, i64 32, !dbg !50\n %275 = getelementptr half, ptr addrspace(1) %.pn47206, i64 %86, !dbg !51\n %276 = add nuw nsw i32 %100, 1, !dbg !33\n %exitcond.not = icmp eq i32 %276, %34, !dbg !33\n br i1 %exitcond.not, label %._crit_edge.loopexit, label %99, !dbg !33\n\n._crit_edge.loopexit: ; preds = %99\n %277 = fptrunc <16 x float> %272 to <16 x half>, !dbg !52\n br label %._crit_edge, !dbg !52\n\n._crit_edge: ; preds = %._crit_edge.loopexit, %10\n %278 = phi <16 x half> [ zeroinitializer, %10 ], [ %277, %._crit_edge.loopexit ]\n %279 = mul i32 %27, %8, !dbg !53\n %280 = mul i32 %8, %28, !dbg !53\n %281 = sext i32 %279 to i64, !dbg !54\n %282 = getelementptr half, ptr addrspace(1) %2, i64 %281, !dbg !54\n %283 = sext i32 %280 to i64, !dbg !54\n %284 = getelementptr half, ptr addrspace(1) %2, i64 %283, !dbg !54\n %285 = sext i32 %32 to i64, !dbg !55\n %286 = getelementptr half, ptr addrspace(1) %282, i64 %285, !dbg !55\n %287 = getelementptr half, ptr addrspace(1) %284, i64 %285, !dbg !55\n %288 = icmp slt i32 %27, %3, !dbg !56\n %289 = icmp slt i32 %28, %3, !dbg !56\n %290 = icmp slt i32 %32, %4, !dbg !57\n %291 = and i1 %288, %290, !dbg !58\n %292 = and i1 %289, %290, !dbg !58\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59\n %293 = and i32 %24, 2, !dbg !59\n %294 = shl nuw nsw i32 %293, 5, !dbg !59\n %295 = shl nuw nsw i32 %24, 3, !dbg !59\n %296 = and i32 %295, 136, !dbg !59\n %297 = or disjoint i32 %294, %296, !dbg !59\n %298 = shl nuw nsw i32 %24, 2, !dbg !59\n %299 = and i32 %298, 48, !dbg !59\n %300 = or disjoint i32 %297, %299, !dbg !59\n %301 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %300, !dbg !59\n %302 = shufflevector <16 x half> %278, <16 x half> poison, <4 x i32> , !dbg !59\n store <4 x half> %302, ptr addrspace(3) %301, align 8, !dbg !59\n %303 = xor i32 %300, 288, !dbg !59\n %304 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %303, !dbg !59\n %305 = shufflevector <16 x half> %278, <16 x half> poison, <4 x i32> , !dbg !59\n store <4 x half> %305, ptr addrspace(3) %304, align 8, !dbg !59\n %306 = xor i32 %300, 520, !dbg !59\n %307 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %306, !dbg !59\n %308 = shufflevector <16 x half> %278, <16 x half> poison, <4 x i32> , !dbg !59\n store <4 x half> %308, ptr addrspace(3) %307, align 8, !dbg !59\n %309 = xor i32 %300, 808, !dbg !59\n %310 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %309, !dbg !59\n %311 = shufflevector <16 x half> %278, <16 x half> poison, <4 x i32> , !dbg !59\n store <4 x half> %311, ptr addrspace(3) %310, align 8, !dbg !59\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59\n %312 = shl nuw nsw i32 %24, 7, !dbg !59\n %313 = and i32 %312, 768, !dbg !59\n %314 = shl nuw nsw i32 %30, 6, !dbg !59\n %315 = or disjoint i32 %313, %314, !dbg !59\n %316 = shl nuw nsw i32 %293, 4, !dbg !59\n %317 = or disjoint i32 %315, %316, !dbg !59\n %318 = shl nuw nsw i32 %24, 1, !dbg !59\n %319 = and i32 %318, 56, !dbg !59\n %320 = xor i32 %317, %319, !dbg !59\n %321 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %320, !dbg !59\n %322 = getelementptr inbounds nuw i8, ptr addrspace(3) %321, i32 128, !dbg !59\n %323 = load <2 x i32>, ptr addrspace(3) %322, align 8, !dbg !59\n %324 = xor i32 %320, 8, !dbg !59\n %325 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %324, !dbg !59\n %326 = getelementptr inbounds nuw i8, ptr addrspace(3) %325, i32 128, !dbg !59\n %327 = load <2 x i32>, ptr addrspace(3) %326, align 8, !dbg !59\n %.uncasted.extract = load i32, ptr addrspace(3) %321, align 8, !dbg !59\n %328 = getelementptr inbounds nuw i8, ptr addrspace(3) %321, i32 4, !dbg !59\n %.uncasted.extract64 = load i32, ptr addrspace(3) %328, align 4, !dbg !59\n %.uncasted.extract66 = load i32, ptr addrspace(3) %325, align 8, !dbg !59\n %329 = getelementptr inbounds nuw i8, ptr addrspace(3) %325, i32 4, !dbg !59\n %.uncasted.extract68 = load i32, ptr addrspace(3) %329, align 4, !dbg !59\n tail call void asm sideeffect \"@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };\", \"r,r,r,r,l,b\"(i32 %.uncasted.extract, i32 %.uncasted.extract64, i32 %.uncasted.extract66, i32 %.uncasted.extract68, ptr addrspace(1) %286, i1 %291) #4, !dbg !59\n %.uncasted.extract70 = extractelement <2 x i32> %323, i64 0, !dbg !59\n %.uncasted.extract72 = extractelement <2 x i32> %323, i64 1, !dbg !59\n %.uncasted.extract74 = extractelement <2 x i32> %327, i64 0, !dbg !59\n %.uncasted.extract76 = extractelement <2 x i32> %327, i64 1, !dbg !59\n tail call void asm sideeffect \"@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };\", \"r,r,r,r,l,b\"(i32 %.uncasted.extract70, i32 %.uncasted.extract72, i32 %.uncasted.extract74, i32 %.uncasted.extract76, ptr addrspace(1) %287, i1 %292) #4, !dbg !59\n ret void, !dbg !60\n}\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare i32 @llvm.smin.i32(i32, i32) #1\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1\n\n; Function Attrs: convergent nocallback nounwind\ndeclare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2\n\n; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>) #3\n\nattributes #0 = { \"nvvm.reqntid\"=\"32\" }\nattributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }\nattributes #2 = { convergent nocallback nounwind }\nattributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }\nattributes #4 = { nounwind }\n\n!llvm.dbg.cu = !{!0}\n!llvm.module.flags = !{!2, !3}\n!llvm.ident = !{!4}\n\n!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: \"triton\", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)\n!1 = !DIFile(filename: \"test_complex_kernels.py\", directory: \"/scratch/findhao/tritonparse/tests\")\n!2 = !{i32 2, !\"Debug Info Version\", i32 3}\n!3 = !{i32 4, !\"nvvm-reflect-ftz\", i32 1}\n!4 = !{!\"clang version 3.8.0 (tags/RELEASE_380/final)\"}\n!5 = distinct !DISubprogram(name: \"matmul_kernel\", linkageName: \"matmul_kernel\", scope: !1, file: !1, line: 38, type: !6, scopeLine: 38, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)\n!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)\n!7 = !{}\n!8 = !DILocation(line: 47, column: 24, scope: !5)\n!9 = !DILocation(line: 40, column: 22, scope: !10, inlinedAt: !12)\n!10 = distinct !DILexicalBlockFile(scope: !5, file: !11, discriminator: 0)\n!11 = !DIFile(filename: \"standard.py\", directory: \"/scratch/findhao/pta/triton/python/triton/language\")\n!12 = !DILocation(line: 48, column: 27, scope: !5)\n!13 = !DILocation(line: 40, column: 28, scope: !10, inlinedAt: !12)\n!14 = !DILocation(line: 40, column: 22, scope: !10, inlinedAt: !15)\n!15 = !DILocation(line: 49, column: 27, scope: !5)\n!16 = !DILocation(line: 40, column: 28, scope: !10, inlinedAt: !15)\n!17 = !DILocation(line: 51, column: 22, scope: !5)\n!18 = !DILocation(line: 53, column: 33, scope: !5)\n!19 = !DILocation(line: 53, column: 46, scope: !5)\n!20 = !DILocation(line: 54, column: 33, scope: !5)\n!21 = !DILocation(line: 54, column: 27, scope: !5)\n!22 = !DILocation(line: 55, column: 19, scope: !5)\n!23 = !DILocation(line: 55, column: 40, scope: !5)\n!24 = !DILocation(line: 57, column: 23, scope: !5)\n!25 = !DILocation(line: 57, column: 51, scope: !5)\n!26 = !DILocation(line: 57, column: 38, scope: !5)\n!27 = !DILocation(line: 58, column: 23, scope: !5)\n!28 = !DILocation(line: 58, column: 51, scope: !5)\n!29 = !DILocation(line: 58, column: 38, scope: !5)\n!30 = !DILocation(line: 40, column: 22, scope: !10, inlinedAt: !31)\n!31 = !DILocation(line: 64, column: 33, scope: !5)\n!32 = !DILocation(line: 40, column: 28, scope: !10, inlinedAt: !31)\n!33 = !DILocation(line: 64, column: 22, scope: !5)\n!34 = !DILocation(line: 69, column: 33, scope: !5)\n!35 = !DILocation(line: 58, column: 68, scope: !5)\n!36 = !DILocation(line: 61, column: 40, scope: !5)\n!37 = !DILocation(line: 61, column: 52, scope: !5)\n!38 = !DILocation(line: 61, column: 22, scope: !5)\n!39 = !DILocation(line: 57, column: 68, scope: !5)\n!40 = !DILocation(line: 60, column: 41, scope: !5)\n!41 = !DILocation(line: 60, column: 53, scope: !5)\n!42 = !DILocation(line: 60, column: 22, scope: !5)\n!43 = !DILocation(line: 65, column: 59, scope: !5)\n!44 = !DILocation(line: 65, column: 55, scope: !5)\n!45 = !DILocation(line: 65, column: 51, scope: !5)\n!46 = !DILocation(line: 65, column: 20, scope: !5)\n!47 = !DILocation(line: 66, column: 51, scope: !5)\n!48 = !DILocation(line: 66, column: 20, scope: !5)\n!49 = !DILocation(line: 67, column: 33, scope: !5)\n!50 = !DILocation(line: 68, column: 18, scope: !5)\n!51 = !DILocation(line: 69, column: 18, scope: !5)\n!52 = !DILocation(line: 70, column: 23, scope: !5)\n!53 = !DILocation(line: 74, column: 33, scope: !5)\n!54 = !DILocation(line: 74, column: 21, scope: !5)\n!55 = !DILocation(line: 74, column: 52, scope: !5)\n!56 = !DILocation(line: 75, column: 33, scope: !5)\n!57 = !DILocation(line: 75, column: 58, scope: !5)\n!58 = !DILocation(line: 75, column: 39, scope: !5)\n!59 = !DILocation(line: 76, column: 21, scope: !5)\n!60 = !DILocation(line: 76, column: 4, scope: !5)\n","matmul_kernel.ptx":"//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 8.7\n.target sm_75\n.address_size 64\n\n\t// .globl\tmatmul_kernel // -- Begin function matmul_kernel\n.extern .shared .align 16 .b8 global_smem[];\n // @matmul_kernel\n.visible .entry matmul_kernel(\n\t.param .u64 .ptr .global .align 1 matmul_kernel_param_0,\n\t.param .u64 .ptr .global .align 1 matmul_kernel_param_1,\n\t.param .u64 .ptr .global .align 1 matmul_kernel_param_2,\n\t.param .u32 matmul_kernel_param_3,\n\t.param .u32 matmul_kernel_param_4,\n\t.param .u32 matmul_kernel_param_5,\n\t.param .u32 matmul_kernel_param_6,\n\t.param .u32 matmul_kernel_param_7,\n\t.param .u32 matmul_kernel_param_8,\n\t.param .u64 .ptr .global .align 1 matmul_kernel_param_9\n)\n.reqntid 32\n{\n\t.reg .pred \t%p<11>;\n\t.reg .b16 \t%rs<57>;\n\t.reg .b32 \t%r<616>;\n\t.reg .b64 \t%rd<30>;\n\t.loc\t1 38 0 // test_complex_kernels.py:38:0\n$L__func_begin0:\n\t.loc\t1 38 0 // test_complex_kernels.py:38:0\n\n// %bb.0:\n\tld.param.b32 \t%r69, [matmul_kernel_param_8];\n\tld.param.b32 \t%r590, [matmul_kernel_param_5];\n\tld.param.b32 \t%r65, [matmul_kernel_param_4];\n\tld.param.b32 \t%r64, [matmul_kernel_param_3];\n\tld.param.b64 \t%rd13, [matmul_kernel_param_2];\n$L__tmp0:\n\t.loc\t1 47 24 // test_complex_kernels.py:47:24\n\tmov.u32 \t%r78, %ctaid.x;\n$L__tmp1:\n\t.loc\t2 40 22 // standard.py:40:22 @[ test_complex_kernels.py:48:27 ]\n\tadd.s32 \t%r79, %r64, 31;\n\t.loc\t2 40 28 // standard.py:40:28 @[ test_complex_kernels.py:48:27 ]\n\tshr.s32 \t%r80, %r79, 31;\n\tshr.u32 \t%r81, %r80, 27;\n\tadd.s32 \t%r82, %r79, %r81;\n\tshr.s32 \t%r83, %r82, 5;\n$L__tmp2:\n\t.loc\t2 40 22 // standard.py:40:22 @[ test_complex_kernels.py:49:27 ]\n\tadd.s32 \t%r84, %r65, 15;\n\t.loc\t2 40 28 // standard.py:40:28 @[ test_complex_kernels.py:49:27 ]\n\tshr.s32 \t%r85, %r84, 31;\n\tshr.u32 \t%r86, %r85, 28;\n\tadd.s32 \t%r87, %r84, %r86;\n\tshr.s32 \t%r88, %r87, 4;\n$L__tmp3:\n\t.loc\t1 51 22 // test_complex_kernels.py:51:22\n\tdiv.s32 \t%r90, %r78, %r88;\n\t.loc\t1 53 33 // test_complex_kernels.py:53:33\n\tsub.s32 \t%r91, %r83, %r90;\n\t.loc\t1 53 46 // test_complex_kernels.py:53:46\n\tmin.s32 \t%r92, %r91, 1;\n\t.loc\t1 54 33 // test_complex_kernels.py:54:33\n\trem.s32 \t%r93, %r78, %r92;\n\t.loc\t1 54 27 // test_complex_kernels.py:54:27\n\tadd.s32 \t%r94, %r93, %r90;\n\t.loc\t1 55 19 // test_complex_kernels.py:55:19\n\tmul.lo.s32 \t%r95, %r90, %r88;\n\tsub.s32 \t%r96, %r78, %r95;\n\t.loc\t1 55 40 // test_complex_kernels.py:55:40\n\tdiv.s32 \t%r97, %r96, %r92;\n\t.loc\t1 57 23 // test_complex_kernels.py:57:23\n\tshl.b32 \t%r98, %r94, 5;\n\t.loc\t1 57 51 // test_complex_kernels.py:57:51\n\tmov.u32 \t%r1, %tid.x;\n\tbfe.u32 \t%r2, %r1, 1, 4;\n\t.loc\t1 57 38 // test_complex_kernels.py:57:38\n\tor.b32 \t%r3, %r98, %r2;\n\tor.b32 \t%r4, %r3, 16;\n\t.loc\t1 58 23 // test_complex_kernels.py:58:23\n\tshl.b32 \t%r99, %r97, 4;\n\t.loc\t1 58 51 // test_complex_kernels.py:58:51\n\tand.b32 \t%r5, %r1, 1;\n\tshl.b32 \t%r6, %r5, 3;\n\t.loc\t1 58 38 // test_complex_kernels.py:58:38\n\tor.b32 \t%r7, %r99, %r6;\n$L__tmp4:\n\t.loc\t2 40 22 // standard.py:40:22 @[ test_complex_kernels.py:64:33 ]\n\tadd.s32 \t%r100, %r590, 15;\n$L__tmp5:\n\t.loc\t1 64 22 // test_complex_kernels.py:64:22\n\tsetp.lt.s32 \t%p1, %r100, 16;\n\tmov.b32 \t%r608, 0;\n\tmov.b32 \t%r589, global_smem;\n\tmov.b32 \t%r609, %r608;\n\tmov.b32 \t%r610, %r608;\n\tmov.b32 \t%r611, %r608;\n\tmov.b32 \t%r612, %r608;\n\tmov.b32 \t%r613, %r608;\n\tmov.b32 \t%r614, %r608;\n\tmov.b32 \t%r615, %r608;\n\t@%p1 bra \t$L__BB0_4;\n// %bb.1: // %.lr.ph\n\t.loc\t1 0 22 // test_complex_kernels.py:0:22\n\tld.param.b32 \t%r68, [matmul_kernel_param_7];\n\tld.param.b32 \t%r67, [matmul_kernel_param_6];\n\tld.param.b64 \t%rd12, [matmul_kernel_param_1];\n\tld.param.b64 \t%rd11, [matmul_kernel_param_0];\n\tshr.s32 \t%r101, %r100, 31;\n\tshr.u32 \t%r102, %r101, 28;\n\tadd.s32 \t%r103, %r100, %r102;\n\tshr.s32 \t%r591, %r103, 4;\n\t.loc\t1 69 33 // test_complex_kernels.py:69:33\n\tshl.b32 \t%r120, %r68, 4;\n\t.loc\t1 58 68 // test_complex_kernels.py:58:68\n\trem.s32 \t%r121, %r7, %r65;\n\t.loc\t1 61 52 // test_complex_kernels.py:61:52\n\tmad.lo.s32 \t%r122, %r68, %r2, %r121;\n\t.loc\t1 61 22 // test_complex_kernels.py:61:22\n\tmul.wide.s32 \t%rd14, %r122, 2;\n\tadd.s64 \t%rd27, %rd12, %rd14;\n\t.loc\t1 57 68 // test_complex_kernels.py:57:68\n\trem.s32 \t%r123, %r4, %r64;\n\t.loc\t1 60 53 // test_complex_kernels.py:60:53\n\tmad.lo.s32 \t%r124, %r123, %r67, %r6;\n\t.loc\t1 60 22 // test_complex_kernels.py:60:22\n\tmul.wide.s32 \t%rd15, %r124, 2;\n\tadd.s64 \t%rd28, %rd11, %rd15;\n\t.loc\t1 57 68 // test_complex_kernels.py:57:68\n\trem.s32 \t%r125, %r3, %r64;\n\t.loc\t1 60 53 // test_complex_kernels.py:60:53\n\tmad.lo.s32 \t%r126, %r125, %r67, %r6;\n\t.loc\t1 60 22 // test_complex_kernels.py:60:22\n\tmul.wide.s32 \t%rd16, %r126, 2;\n\tadd.s64 \t%rd29, %rd11, %rd16;\n\tshl.b32 \t%r127, %r1, 5;\n\tand.b32 \t%r128, %r127, 992;\n\tadd.s32 \t%r9, %r589, %r128;\n\tshl.b32 \t%r130, %r1, 6;\n\tand.b32 \t%r131, %r130, 1792;\n\tadd.s32 \t%r10, %r589, %r131;\n\tshl.b32 \t%r132, %r1, 4;\n\tand.b32 \t%r133, %r132, 48;\n\tadd.s32 \t%r11, %r589, %r133;\n\t.loc\t1 64 22 // test_complex_kernels.py:64:22\n\tmul.wide.s32 \t%rd4, %r120, 2;\n\tmov.b32 \t%r592, 0f00000000;\n\tmov.b32 \t%r593, %r592;\n\tmov.b32 \t%r594, %r592;\n\tmov.b32 \t%r595, %r592;\n\tmov.b32 \t%r596, %r592;\n\tmov.b32 \t%r597, %r592;\n\tmov.b32 \t%r598, %r592;\n\tmov.b32 \t%r599, %r592;\n\tmov.b32 \t%r600, %r592;\n\tmov.b32 \t%r601, %r592;\n\tmov.b32 \t%r602, %r592;\n\tmov.b32 \t%r603, %r592;\n\tmov.b32 \t%r604, %r592;\n\tmov.b32 \t%r605, %r592;\n\tmov.b32 \t%r606, %r592;\n\tmov.b32 \t%r607, %r592;\n$L__BB0_2: // =>This Inner Loop Header: Depth=1\n\t.loc\t1 65 51 // test_complex_kernels.py:65:51\n\tsetp.lt.s32 \t%p2, %r6, %r590;\n\tmov.b32 \t%r138, 0;\n\t.loc\t1 65 20 // test_complex_kernels.py:65:20\n\t// begin inline asm\n\tmov.u32 %r134, %r138;\n\tmov.u32 %r135, %r138;\n\tmov.u32 %r136, %r138;\n\tmov.u32 %r137, %r138;\n\t@%p2 ld.global.v4.b32 { %r134, %r135, %r136, %r137 }, [ %rd29 + 0 ];\n\t// end inline asm\n\t// begin inline asm\n\tmov.u32 %r142, %r138;\n\tmov.u32 %r143, %r138;\n\tmov.u32 %r144, %r138;\n\tmov.u32 %r145, %r138;\n\t@%p2 ld.global.v4.b32 { %r142, %r143, %r144, %r145 }, [ %rd28 + 0 ];\n\t// end inline asm\n\t.loc\t1 66 51 // test_complex_kernels.py:66:51\n\tsetp.lt.s32 \t%p4, %r2, %r590;\n\t.loc\t1 66 20 // test_complex_kernels.py:66:20\n\t// begin inline asm\n\tmov.u32 %r150, %r138;\n\tmov.u32 %r151, %r138;\n\tmov.u32 %r152, %r138;\n\tmov.u32 %r153, %r138;\n\t@%p4 ld.global.v4.b32 { %r150, %r151, %r152, %r153 }, [ %rd27 + 0 ];\n\t// end inline asm\n\t.loc\t1 67 33 // test_complex_kernels.py:67:33\n\tmov.b32 \t{%rs1, %rs2}, %r135;\n\tcvt.f32.f16 \t%r158, %rs2;\n\tcvt.f32.f16 \t%r159, %rs1;\n\tmov.b32 \t{%rs3, %rs4}, %r134;\n\tcvt.f32.f16 \t%r160, %rs4;\n\tcvt.f32.f16 \t%r161, %rs3;\n\tmov.b32 \t{%rs5, %rs6}, %r137;\n\tcvt.f32.f16 \t%r162, %rs6;\n\tcvt.f32.f16 \t%r163, %rs5;\n\tmov.b32 \t{%rs7, %rs8}, %r136;\n\tcvt.f32.f16 \t%r164, %rs8;\n\tcvt.f32.f16 \t%r165, %rs7;\n\tmov.b32 \t{%rs9, %rs10}, %r143;\n\tcvt.f32.f16 \t%r166, %rs10;\n\tcvt.f32.f16 \t%r167, %rs9;\n\tmov.b32 \t{%rs11, %rs12}, %r142;\n\tcvt.f32.f16 \t%r168, %rs12;\n\tcvt.f32.f16 \t%r169, %rs11;\n\tmov.b32 \t{%rs13, %rs14}, %r145;\n\tcvt.f32.f16 \t%r170, %rs14;\n\tcvt.f32.f16 \t%r171, %rs13;\n\tmov.b32 \t{%rs15, %rs16}, %r144;\n\tcvt.f32.f16 \t%r172, %rs16;\n\tcvt.f32.f16 \t%r173, %rs15;\n\tbar.sync \t0;\n\tst.shared.v4.b32 \t[%r9], {%r161, %r160, %r159, %r158};\n\tst.shared.v4.b32 \t[%r9+16], {%r165, %r164, %r163, %r162};\n\tst.shared.v4.b32 \t[%r9+1024], {%r169, %r168, %r167, %r166};\n\tst.shared.v4.b32 \t[%r9+1040], {%r173, %r172, %r171, %r170};\n\tbar.sync \t0;\n\tmov.b32 \t{%rs17, %rs18}, %r151;\n\tcvt.f32.f16 \t%r174, %rs18;\n\tcvt.f32.f16 \t%r175, %rs17;\n\tmov.b32 \t{%rs19, %rs20}, %r150;\n\tcvt.f32.f16 \t%r176, %rs20;\n\tcvt.f32.f16 \t%r177, %rs19;\n\tmov.b32 \t{%rs21, %rs22}, %r153;\n\tcvt.f32.f16 \t%r178, %rs22;\n\tcvt.f32.f16 \t%r179, %rs21;\n\tmov.b32 \t{%rs23, %rs24}, %r152;\n\tcvt.f32.f16 \t%r180, %rs24;\n\tcvt.f32.f16 \t%r181, %rs23;\n\tld.shared.v2.b32 \t{%r182, %r183}, [%r10+32];\n\tld.shared.v2.b32 \t{%r184, %r185}, [%r10+24];\n\tld.shared.v2.b32 \t{%r186, %r187}, [%r10+48];\n\tld.shared.v2.b32 \t{%r188, %r189}, [%r10+40];\n\tld.shared.v2.b32 \t{%r190, %r191}, [%r10+56];\n\tld.shared.v2.b32 \t{%r192, %r193}, [%r10+96];\n\tld.shared.v2.b32 \t{%r194, %r195}, [%r10+88];\n\tld.shared.v2.b32 \t{%r196, %r197}, [%r10+112];\n\tld.shared.v2.b32 \t{%r198, %r199}, [%r10+104];\n\tld.shared.v2.b32 \t{%r200, %r201}, [%r10+120];\n\tld.shared.v2.b32 \t{%r202, %r203}, [%r10+160];\n\tld.shared.v2.b32 \t{%r204, %r205}, [%r10+152];\n\tld.shared.v2.b32 \t{%r206, %r207}, [%r10+176];\n\tld.shared.v2.b32 \t{%r208, %r209}, [%r10+168];\n\tld.shared.v2.b32 \t{%r210, %r211}, [%r10+184];\n\tld.shared.v2.b32 \t{%r212, %r213}, [%r10+224];\n\tld.shared.v2.b32 \t{%r214, %r215}, [%r10+216];\n\tld.shared.v2.b32 \t{%r216, %r217}, [%r10+240];\n\tld.shared.v2.b32 \t{%r218, %r219}, [%r10+232];\n\tld.shared.v2.b32 \t{%r220, %r221}, [%r10+248];\n\tld.shared.v4.b32 \t{%r222, %r223, %r224, %r225}, [%r10];\n\tld.shared.v2.b32 \t{%r226, %r227}, [%r10+16];\n\tld.shared.v4.b32 \t{%r228, %r229, %r230, %r231}, [%r10+64];\n\tld.shared.v2.b32 \t{%r232, %r233}, [%r10+80];\n\tld.shared.v4.b32 \t{%r234, %r235, %r236, %r237}, [%r10+128];\n\tld.shared.v2.b32 \t{%r238, %r239}, [%r10+144];\n\tld.shared.v4.b32 \t{%r240, %r241, %r242, %r243}, [%r10+192];\n\tld.shared.v2.b32 \t{%r244, %r245}, [%r10+208];\n\tbar.sync \t0;\n\tst.shared.v4.b32 \t[%r9], {%r177, %r176, %r175, %r174};\n\tst.shared.v4.b32 \t[%r9+16], {%r181, %r180, %r179, %r178};\n\tbar.sync \t0;\n\tld.shared.v4.b32 \t{%r246, %r247, %r248, %r249}, [%r11];\n\tld.shared.v4.b32 \t{%r250, %r251, %r252, %r253}, [%r11+64];\n\tld.shared.v4.b32 \t{%r254, %r255, %r256, %r257}, [%r11+128];\n\tld.shared.v4.b32 \t{%r258, %r259, %r260, %r261}, [%r11+192];\n\tld.shared.v4.b32 \t{%r262, %r263, %r264, %r265}, [%r11+256];\n\tld.shared.v4.b32 \t{%r266, %r267, %r268, %r269}, [%r11+320];\n\tld.shared.v4.b32 \t{%r270, %r271, %r272, %r273}, [%r11+384];\n\tld.shared.v4.b32 \t{%r274, %r275, %r276, %r277}, [%r11+448];\n\tld.shared.v4.b32 \t{%r278, %r279, %r280, %r281}, [%r11+512];\n\tld.shared.v4.b32 \t{%r282, %r283, %r284, %r285}, [%r11+576];\n\tld.shared.v4.b32 \t{%r286, %r287, %r288, %r289}, [%r11+640];\n\tld.shared.v4.b32 \t{%r290, %r291, %r292, %r293}, [%r11+704];\n\tld.shared.v4.b32 \t{%r294, %r295, %r296, %r297}, [%r11+768];\n\tld.shared.v4.b32 \t{%r298, %r299, %r300, %r301}, [%r11+832];\n\tld.shared.v4.b32 \t{%r302, %r303, %r304, %r305}, [%r11+896];\n\tld.shared.v4.b32 \t{%r306, %r307, %r308, %r309}, [%r11+960];\n\tfma.rn.f32 \t%r310, %r240, %r249, %r607;\n\tfma.rn.f32 \t%r311, %r234, %r249, %r606;\n\tfma.rn.f32 \t%r312, %r228, %r249, %r605;\n\tfma.rn.f32 \t%r313, %r222, %r246, %r592;\n\tfma.rn.f32 \t%r314, %r228, %r246, %r593;\n\tfma.rn.f32 \t%r315, %r234, %r246, %r594;\n\tfma.rn.f32 \t%r316, %r240, %r246, %r595;\n\tfma.rn.f32 \t%r317, %r222, %r247, %r596;\n\tfma.rn.f32 \t%r318, %r228, %r247, %r597;\n\tfma.rn.f32 \t%r319, %r234, %r247, %r598;\n\tfma.rn.f32 \t%r320, %r240, %r247, %r599;\n\tfma.rn.f32 \t%r321, %r222, %r248, %r600;\n\tfma.rn.f32 \t%r322, %r228, %r248, %r601;\n\tfma.rn.f32 \t%r323, %r234, %r248, %r602;\n\tfma.rn.f32 \t%r324, %r240, %r248, %r603;\n\tfma.rn.f32 \t%r325, %r222, %r249, %r604;\n\tfma.rn.f32 \t%r326, %r223, %r253, %r325;\n\tfma.rn.f32 \t%r327, %r241, %r252, %r324;\n\tfma.rn.f32 \t%r328, %r235, %r252, %r323;\n\tfma.rn.f32 \t%r329, %r229, %r252, %r322;\n\tfma.rn.f32 \t%r330, %r223, %r252, %r321;\n\tfma.rn.f32 \t%r331, %r241, %r251, %r320;\n\tfma.rn.f32 \t%r332, %r235, %r251, %r319;\n\tfma.rn.f32 \t%r333, %r229, %r251, %r318;\n\tfma.rn.f32 \t%r334, %r223, %r251, %r317;\n\tfma.rn.f32 \t%r335, %r241, %r250, %r316;\n\tfma.rn.f32 \t%r336, %r235, %r250, %r315;\n\tfma.rn.f32 \t%r337, %r229, %r250, %r314;\n\tfma.rn.f32 \t%r338, %r223, %r250, %r313;\n\tfma.rn.f32 \t%r339, %r229, %r253, %r312;\n\tfma.rn.f32 \t%r340, %r235, %r253, %r311;\n\tfma.rn.f32 \t%r341, %r241, %r253, %r310;\n\tfma.rn.f32 \t%r342, %r242, %r257, %r341;\n\tfma.rn.f32 \t%r343, %r236, %r257, %r340;\n\tfma.rn.f32 \t%r344, %r230, %r257, %r339;\n\tfma.rn.f32 \t%r345, %r224, %r254, %r338;\n\tfma.rn.f32 \t%r346, %r230, %r254, %r337;\n\tfma.rn.f32 \t%r347, %r236, %r254, %r336;\n\tfma.rn.f32 \t%r348, %r242, %r254, %r335;\n\tfma.rn.f32 \t%r349, %r224, %r255, %r334;\n\tfma.rn.f32 \t%r350, %r230, %r255, %r333;\n\tfma.rn.f32 \t%r351, %r236, %r255, %r332;\n\tfma.rn.f32 \t%r352, %r242, %r255, %r331;\n\tfma.rn.f32 \t%r353, %r224, %r256, %r330;\n\tfma.rn.f32 \t%r354, %r230, %r256, %r329;\n\tfma.rn.f32 \t%r355, %r236, %r256, %r328;\n\tfma.rn.f32 \t%r356, %r242, %r256, %r327;\n\tfma.rn.f32 \t%r357, %r224, %r257, %r326;\n\tfma.rn.f32 \t%r358, %r225, %r261, %r357;\n\tfma.rn.f32 \t%r359, %r243, %r260, %r356;\n\tfma.rn.f32 \t%r360, %r237, %r260, %r355;\n\tfma.rn.f32 \t%r361, %r231, %r260, %r354;\n\tfma.rn.f32 \t%r362, %r225, %r260, %r353;\n\tfma.rn.f32 \t%r363, %r243, %r259, %r352;\n\tfma.rn.f32 \t%r364, %r237, %r259, %r351;\n\tfma.rn.f32 \t%r365, %r231, %r259, %r350;\n\tfma.rn.f32 \t%r366, %r225, %r259, %r349;\n\tfma.rn.f32 \t%r367, %r243, %r258, %r348;\n\tfma.rn.f32 \t%r368, %r237, %r258, %r347;\n\tfma.rn.f32 \t%r369, %r231, %r258, %r346;\n\tfma.rn.f32 \t%r370, %r225, %r258, %r345;\n\tfma.rn.f32 \t%r371, %r231, %r261, %r344;\n\tfma.rn.f32 \t%r372, %r237, %r261, %r343;\n\tfma.rn.f32 \t%r373, %r243, %r261, %r342;\n\tfma.rn.f32 \t%r374, %r244, %r265, %r373;\n\tfma.rn.f32 \t%r375, %r238, %r265, %r372;\n\tfma.rn.f32 \t%r376, %r232, %r265, %r371;\n\tfma.rn.f32 \t%r377, %r226, %r262, %r370;\n\tfma.rn.f32 \t%r378, %r232, %r262, %r369;\n\tfma.rn.f32 \t%r379, %r238, %r262, %r368;\n\tfma.rn.f32 \t%r380, %r244, %r262, %r367;\n\tfma.rn.f32 \t%r381, %r226, %r263, %r366;\n\tfma.rn.f32 \t%r382, %r232, %r263, %r365;\n\tfma.rn.f32 \t%r383, %r238, %r263, %r364;\n\tfma.rn.f32 \t%r384, %r244, %r263, %r363;\n\tfma.rn.f32 \t%r385, %r226, %r264, %r362;\n\tfma.rn.f32 \t%r386, %r232, %r264, %r361;\n\tfma.rn.f32 \t%r387, %r238, %r264, %r360;\n\tfma.rn.f32 \t%r388, %r244, %r264, %r359;\n\tfma.rn.f32 \t%r389, %r226, %r265, %r358;\n\tfma.rn.f32 \t%r390, %r227, %r269, %r389;\n\tfma.rn.f32 \t%r391, %r245, %r268, %r388;\n\tfma.rn.f32 \t%r392, %r239, %r268, %r387;\n\tfma.rn.f32 \t%r393, %r233, %r268, %r386;\n\tfma.rn.f32 \t%r394, %r227, %r268, %r385;\n\tfma.rn.f32 \t%r395, %r245, %r267, %r384;\n\tfma.rn.f32 \t%r396, %r239, %r267, %r383;\n\tfma.rn.f32 \t%r397, %r233, %r267, %r382;\n\tfma.rn.f32 \t%r398, %r227, %r267, %r381;\n\tfma.rn.f32 \t%r399, %r245, %r266, %r380;\n\tfma.rn.f32 \t%r400, %r239, %r266, %r379;\n\tfma.rn.f32 \t%r401, %r233, %r266, %r378;\n\tfma.rn.f32 \t%r402, %r227, %r266, %r377;\n\tfma.rn.f32 \t%r403, %r233, %r269, %r376;\n\tfma.rn.f32 \t%r404, %r239, %r269, %r375;\n\tfma.rn.f32 \t%r405, %r245, %r269, %r374;\n\tfma.rn.f32 \t%r406, %r214, %r273, %r405;\n\tfma.rn.f32 \t%r407, %r204, %r273, %r404;\n\tfma.rn.f32 \t%r408, %r194, %r273, %r403;\n\tfma.rn.f32 \t%r409, %r184, %r270, %r402;\n\tfma.rn.f32 \t%r410, %r194, %r270, %r401;\n\tfma.rn.f32 \t%r411, %r204, %r270, %r400;\n\tfma.rn.f32 \t%r412, %r214, %r270, %r399;\n\tfma.rn.f32 \t%r413, %r184, %r271, %r398;\n\tfma.rn.f32 \t%r414, %r194, %r271, %r397;\n\tfma.rn.f32 \t%r415, %r204, %r271, %r396;\n\tfma.rn.f32 \t%r416, %r214, %r271, %r395;\n\tfma.rn.f32 \t%r417, %r184, %r272, %r394;\n\tfma.rn.f32 \t%r418, %r194, %r272, %r393;\n\tfma.rn.f32 \t%r419, %r204, %r272, %r392;\n\tfma.rn.f32 \t%r420, %r214, %r272, %r391;\n\tfma.rn.f32 \t%r421, %r184, %r273, %r390;\n\tfma.rn.f32 \t%r422, %r185, %r277, %r421;\n\tfma.rn.f32 \t%r423, %r215, %r276, %r420;\n\tfma.rn.f32 \t%r424, %r205, %r276, %r419;\n\tfma.rn.f32 \t%r425, %r195, %r276, %r418;\n\tfma.rn.f32 \t%r426, %r185, %r276, %r417;\n\tfma.rn.f32 \t%r427, %r215, %r275, %r416;\n\tfma.rn.f32 \t%r428, %r205, %r275, %r415;\n\tfma.rn.f32 \t%r429, %r195, %r275, %r414;\n\tfma.rn.f32 \t%r430, %r185, %r275, %r413;\n\tfma.rn.f32 \t%r431, %r215, %r274, %r412;\n\tfma.rn.f32 \t%r432, %r205, %r274, %r411;\n\tfma.rn.f32 \t%r433, %r195, %r274, %r410;\n\tfma.rn.f32 \t%r434, %r185, %r274, %r409;\n\tfma.rn.f32 \t%r435, %r195, %r277, %r408;\n\tfma.rn.f32 \t%r436, %r205, %r277, %r407;\n\tfma.rn.f32 \t%r437, %r215, %r277, %r406;\n\tfma.rn.f32 \t%r438, %r212, %r281, %r437;\n\tfma.rn.f32 \t%r439, %r202, %r281, %r436;\n\tfma.rn.f32 \t%r440, %r192, %r281, %r435;\n\tfma.rn.f32 \t%r441, %r182, %r278, %r434;\n\tfma.rn.f32 \t%r442, %r192, %r278, %r433;\n\tfma.rn.f32 \t%r443, %r202, %r278, %r432;\n\tfma.rn.f32 \t%r444, %r212, %r278, %r431;\n\tfma.rn.f32 \t%r445, %r182, %r279, %r430;\n\tfma.rn.f32 \t%r446, %r192, %r279, %r429;\n\tfma.rn.f32 \t%r447, %r202, %r279, %r428;\n\tfma.rn.f32 \t%r448, %r212, %r279, %r427;\n\tfma.rn.f32 \t%r449, %r182, %r280, %r426;\n\tfma.rn.f32 \t%r450, %r192, %r280, %r425;\n\tfma.rn.f32 \t%r451, %r202, %r280, %r424;\n\tfma.rn.f32 \t%r452, %r212, %r280, %r423;\n\tfma.rn.f32 \t%r453, %r182, %r281, %r422;\n\tfma.rn.f32 \t%r454, %r183, %r285, %r453;\n\tfma.rn.f32 \t%r455, %r213, %r284, %r452;\n\tfma.rn.f32 \t%r456, %r203, %r284, %r451;\n\tfma.rn.f32 \t%r457, %r193, %r284, %r450;\n\tfma.rn.f32 \t%r458, %r183, %r284, %r449;\n\tfma.rn.f32 \t%r459, %r213, %r283, %r448;\n\tfma.rn.f32 \t%r460, %r203, %r283, %r447;\n\tfma.rn.f32 \t%r461, %r193, %r283, %r446;\n\tfma.rn.f32 \t%r462, %r183, %r283, %r445;\n\tfma.rn.f32 \t%r463, %r213, %r282, %r444;\n\tfma.rn.f32 \t%r464, %r203, %r282, %r443;\n\tfma.rn.f32 \t%r465, %r193, %r282, %r442;\n\tfma.rn.f32 \t%r466, %r183, %r282, %r441;\n\tfma.rn.f32 \t%r467, %r193, %r285, %r440;\n\tfma.rn.f32 \t%r468, %r203, %r285, %r439;\n\tfma.rn.f32 \t%r469, %r213, %r285, %r438;\n\tfma.rn.f32 \t%r470, %r218, %r289, %r469;\n\tfma.rn.f32 \t%r471, %r208, %r289, %r468;\n\tfma.rn.f32 \t%r472, %r198, %r289, %r467;\n\tfma.rn.f32 \t%r473, %r188, %r286, %r466;\n\tfma.rn.f32 \t%r474, %r198, %r286, %r465;\n\tfma.rn.f32 \t%r475, %r208, %r286, %r464;\n\tfma.rn.f32 \t%r476, %r218, %r286, %r463;\n\tfma.rn.f32 \t%r477, %r188, %r287, %r462;\n\tfma.rn.f32 \t%r478, %r198, %r287, %r461;\n\tfma.rn.f32 \t%r479, %r208, %r287, %r460;\n\tfma.rn.f32 \t%r480, %r218, %r287, %r459;\n\tfma.rn.f32 \t%r481, %r188, %r288, %r458;\n\tfma.rn.f32 \t%r482, %r198, %r288, %r457;\n\tfma.rn.f32 \t%r483, %r208, %r288, %r456;\n\tfma.rn.f32 \t%r484, %r218, %r288, %r455;\n\tfma.rn.f32 \t%r485, %r188, %r289, %r454;\n\tfma.rn.f32 \t%r486, %r189, %r293, %r485;\n\tfma.rn.f32 \t%r487, %r219, %r292, %r484;\n\tfma.rn.f32 \t%r488, %r209, %r292, %r483;\n\tfma.rn.f32 \t%r489, %r199, %r292, %r482;\n\tfma.rn.f32 \t%r490, %r189, %r292, %r481;\n\tfma.rn.f32 \t%r491, %r219, %r291, %r480;\n\tfma.rn.f32 \t%r492, %r209, %r291, %r479;\n\tfma.rn.f32 \t%r493, %r199, %r291, %r478;\n\tfma.rn.f32 \t%r494, %r189, %r291, %r477;\n\tfma.rn.f32 \t%r495, %r219, %r290, %r476;\n\tfma.rn.f32 \t%r496, %r209, %r290, %r475;\n\tfma.rn.f32 \t%r497, %r199, %r290, %r474;\n\tfma.rn.f32 \t%r498, %r189, %r290, %r473;\n\tfma.rn.f32 \t%r499, %r199, %r293, %r472;\n\tfma.rn.f32 \t%r500, %r209, %r293, %r471;\n\tfma.rn.f32 \t%r501, %r219, %r293, %r470;\n\tfma.rn.f32 \t%r502, %r216, %r297, %r501;\n\tfma.rn.f32 \t%r503, %r206, %r297, %r500;\n\tfma.rn.f32 \t%r504, %r196, %r297, %r499;\n\tfma.rn.f32 \t%r505, %r186, %r294, %r498;\n\tfma.rn.f32 \t%r506, %r196, %r294, %r497;\n\tfma.rn.f32 \t%r507, %r206, %r294, %r496;\n\tfma.rn.f32 \t%r508, %r216, %r294, %r495;\n\tfma.rn.f32 \t%r509, %r186, %r295, %r494;\n\tfma.rn.f32 \t%r510, %r196, %r295, %r493;\n\tfma.rn.f32 \t%r511, %r206, %r295, %r492;\n\tfma.rn.f32 \t%r512, %r216, %r295, %r491;\n\tfma.rn.f32 \t%r513, %r186, %r296, %r490;\n\tfma.rn.f32 \t%r514, %r196, %r296, %r489;\n\tfma.rn.f32 \t%r515, %r206, %r296, %r488;\n\tfma.rn.f32 \t%r516, %r216, %r296, %r487;\n\tfma.rn.f32 \t%r517, %r186, %r297, %r486;\n\tfma.rn.f32 \t%r518, %r187, %r301, %r517;\n\tfma.rn.f32 \t%r519, %r217, %r300, %r516;\n\tfma.rn.f32 \t%r520, %r207, %r300, %r515;\n\tfma.rn.f32 \t%r521, %r197, %r300, %r514;\n\tfma.rn.f32 \t%r522, %r187, %r300, %r513;\n\tfma.rn.f32 \t%r523, %r217, %r299, %r512;\n\tfma.rn.f32 \t%r524, %r207, %r299, %r511;\n\tfma.rn.f32 \t%r525, %r197, %r299, %r510;\n\tfma.rn.f32 \t%r526, %r187, %r299, %r509;\n\tfma.rn.f32 \t%r527, %r217, %r298, %r508;\n\tfma.rn.f32 \t%r528, %r207, %r298, %r507;\n\tfma.rn.f32 \t%r529, %r197, %r298, %r506;\n\tfma.rn.f32 \t%r530, %r187, %r298, %r505;\n\tfma.rn.f32 \t%r531, %r197, %r301, %r504;\n\tfma.rn.f32 \t%r532, %r207, %r301, %r503;\n\tfma.rn.f32 \t%r533, %r217, %r301, %r502;\n\tfma.rn.f32 \t%r534, %r220, %r305, %r533;\n\tfma.rn.f32 \t%r535, %r210, %r305, %r532;\n\tfma.rn.f32 \t%r536, %r200, %r305, %r531;\n\tfma.rn.f32 \t%r537, %r190, %r302, %r530;\n\tfma.rn.f32 \t%r538, %r200, %r302, %r529;\n\tfma.rn.f32 \t%r539, %r210, %r302, %r528;\n\tfma.rn.f32 \t%r540, %r220, %r302, %r527;\n\tfma.rn.f32 \t%r541, %r190, %r303, %r526;\n\tfma.rn.f32 \t%r542, %r200, %r303, %r525;\n\tfma.rn.f32 \t%r543, %r210, %r303, %r524;\n\tfma.rn.f32 \t%r544, %r220, %r303, %r523;\n\tfma.rn.f32 \t%r545, %r190, %r304, %r522;\n\tfma.rn.f32 \t%r546, %r200, %r304, %r521;\n\tfma.rn.f32 \t%r547, %r210, %r304, %r520;\n\tfma.rn.f32 \t%r548, %r220, %r304, %r519;\n\tfma.rn.f32 \t%r549, %r190, %r305, %r518;\n\tfma.rn.f32 \t%r604, %r191, %r309, %r549;\n\tfma.rn.f32 \t%r603, %r221, %r308, %r548;\n\tfma.rn.f32 \t%r602, %r211, %r308, %r547;\n\tfma.rn.f32 \t%r601, %r201, %r308, %r546;\n\tfma.rn.f32 \t%r600, %r191, %r308, %r545;\n\tfma.rn.f32 \t%r599, %r221, %r307, %r544;\n\tfma.rn.f32 \t%r598, %r211, %r307, %r543;\n\tfma.rn.f32 \t%r597, %r201, %r307, %r542;\n\tfma.rn.f32 \t%r596, %r191, %r307, %r541;\n\tfma.rn.f32 \t%r595, %r221, %r306, %r540;\n\tfma.rn.f32 \t%r594, %r211, %r306, %r539;\n\tfma.rn.f32 \t%r593, %r201, %r306, %r538;\n\tfma.rn.f32 \t%r592, %r191, %r306, %r537;\n\tfma.rn.f32 \t%r605, %r201, %r309, %r536;\n\tfma.rn.f32 \t%r606, %r211, %r309, %r535;\n\tfma.rn.f32 \t%r607, %r221, %r309, %r534;\n\t.loc\t1 68 18 // test_complex_kernels.py:68:18\n\tadd.s64 \t%rd29, %rd29, 32;\n\tadd.s64 \t%rd28, %rd28, 32;\n\t.loc\t1 64 22 // test_complex_kernels.py:64:22\n\tadd.s64 \t%rd27, %rd27, %rd4;\n\tadd.s32 \t%r591, %r591, -1;\n\tadd.s32 \t%r590, %r590, -16;\n\tsetp.ne.s32 \t%p5, %r591, 0;\n\t@%p5 bra \t$L__BB0_2;\n// %bb.3: // %._crit_edge.loopexit\n\t.loc\t1 70 23 // test_complex_kernels.py:70:23\n\tcvt.rn.f16.f32 \t%rs25, %r592;\n\tcvt.rn.f16.f32 \t%rs26, %r593;\n\tmov.b32 \t%r608, {%rs25, %rs26};\n\tcvt.rn.f16.f32 \t%rs27, %r594;\n\tcvt.rn.f16.f32 \t%rs28, %r595;\n\tmov.b32 \t%r609, {%rs27, %rs28};\n\tcvt.rn.f16.f32 \t%rs29, %r596;\n\tcvt.rn.f16.f32 \t%rs30, %r597;\n\tmov.b32 \t%r610, {%rs29, %rs30};\n\tcvt.rn.f16.f32 \t%rs31, %r598;\n\tcvt.rn.f16.f32 \t%rs32, %r599;\n\tmov.b32 \t%r611, {%rs31, %rs32};\n\tcvt.rn.f16.f32 \t%rs33, %r600;\n\tcvt.rn.f16.f32 \t%rs34, %r601;\n\tmov.b32 \t%r612, {%rs33, %rs34};\n\tcvt.rn.f16.f32 \t%rs35, %r602;\n\tcvt.rn.f16.f32 \t%rs36, %r603;\n\tmov.b32 \t%r613, {%rs35, %rs36};\n\tcvt.rn.f16.f32 \t%rs37, %r604;\n\tcvt.rn.f16.f32 \t%rs38, %r605;\n\tmov.b32 \t%r614, {%rs37, %rs38};\n\tcvt.rn.f16.f32 \t%rs39, %r606;\n\tcvt.rn.f16.f32 \t%rs40, %r607;\n\tmov.b32 \t%r615, {%rs39, %rs40};\n$L__BB0_4: // %._crit_edge\n\t.loc\t1 74 33 // test_complex_kernels.py:74:33\n\tmul.lo.s32 \t%r558, %r3, %r69;\n\tshl.b32 \t%r559, %r69, 4;\n\tadd.s32 \t%r560, %r558, %r559;\n\t.loc\t1 74 21 // test_complex_kernels.py:74:21\n\tmul.wide.s32 \t%rd22, %r558, 2;\n\tadd.s64 \t%rd23, %rd13, %rd22;\n\tmul.wide.s32 \t%rd24, %r560, 2;\n\tadd.s64 \t%rd25, %rd13, %rd24;\n\t.loc\t1 74 52 // test_complex_kernels.py:74:52\n\tmul.wide.s32 \t%rd26, %r7, 2;\n\tadd.s64 \t%rd20, %rd23, %rd26;\n\tadd.s64 \t%rd21, %rd25, %rd26;\n\t.loc\t1 75 33 // test_complex_kernels.py:75:33\n\tsetp.lt.s32 \t%p8, %r3, %r64;\n\tsetp.lt.s32 \t%p9, %r4, %r64;\n\t.loc\t1 75 58 // test_complex_kernels.py:75:58\n\tsetp.lt.s32 \t%p10, %r7, %r65;\n\t.loc\t1 75 39 // test_complex_kernels.py:75:39\n\tand.pred \t%p6, %p8, %p10;\n\tand.pred \t%p7, %p9, %p10;\n\t.loc\t1 76 21 // test_complex_kernels.py:76:21\n\tbar.sync \t0;\n\tand.b32 \t%r561, %r1, 2;\n\tshl.b32 \t%r562, %r561, 5;\n\tshl.b32 \t%r563, %r1, 3;\n\tand.b32 \t%r564, %r563, 136;\n\tor.b32 \t%r565, %r562, %r564;\n\tshl.b32 \t%r566, %r1, 2;\n\tand.b32 \t%r567, %r566, 48;\n\tor.b32 \t%r568, %r565, %r567;\n\tadd.s32 \t%r570, %r589, %r568;\n\tmov.b32 \t{%rs41, %rs42}, %r608;\n\tmov.b32 \t{%rs43, %rs44}, %r610;\n\tmov.b32 \t{%rs45, %rs46}, %r612;\n\tmov.b32 \t{%rs47, %rs48}, %r614;\n\tst.shared.v4.b16 \t[%r570], {%rs41, %rs43, %rs45, %rs47};\n\txor.b32 \t%r571, %r568, 288;\n\tadd.s32 \t%r572, %r589, %r571;\n\tst.shared.v4.b16 \t[%r572], {%rs42, %rs44, %rs46, %rs48};\n\txor.b32 \t%r573, %r568, 520;\n\tadd.s32 \t%r574, %r589, %r573;\n\tmov.b32 \t{%rs49, %rs50}, %r609;\n\tmov.b32 \t{%rs51, %rs52}, %r611;\n\tmov.b32 \t{%rs53, %rs54}, %r613;\n\tmov.b32 \t{%rs55, %rs56}, %r615;\n\tst.shared.v4.b16 \t[%r574], {%rs49, %rs51, %rs53, %rs55};\n\txor.b32 \t%r575, %r568, 808;\n\tadd.s32 \t%r576, %r589, %r575;\n\tst.shared.v4.b16 \t[%r576], {%rs50, %rs52, %rs54, %rs56};\n\tbar.sync \t0;\n\tshl.b32 \t%r577, %r1, 7;\n\tand.b32 \t%r578, %r577, 768;\n\tshl.b32 \t%r579, %r5, 6;\n\tor.b32 \t%r580, %r578, %r579;\n\tshl.b32 \t%r581, %r561, 4;\n\tor.b32 \t%r582, %r580, %r581;\n\tshl.b32 \t%r583, %r1, 1;\n\tand.b32 \t%r584, %r583, 56;\n\txor.b32 \t%r585, %r582, %r584;\n\tadd.s32 \t%r586, %r589, %r585;\n\tld.shared.v2.b32 \t{%r554, %r555}, [%r586+128];\n\txor.b32 \t%r587, %r585, 8;\n\tadd.s32 \t%r588, %r589, %r587;\n\tld.shared.v2.b32 \t{%r556, %r557}, [%r588+128];\n\tld.shared.v2.b32 \t{%r550, %r551}, [%r586];\n\tld.shared.v2.b32 \t{%r552, %r553}, [%r588];\n\t// begin inline asm\n\t@%p6 st.global.v4.b32 [ %rd20 + 0 ], { %r550, %r551, %r552, %r553 };\n\t// end inline asm\n\t// begin inline asm\n\t@%p7 st.global.v4.b32 [ %rd21 + 0 ], { %r554, %r555, %r556, %r557 };\n\t// end inline asm\n\t.loc\t1 76 4 // test_complex_kernels.py:76:4\n\tret;\n$L__tmp6:\n$L__func_end0:\n // -- End function\n}\n\t.file\t1 \"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\"\n\t.file\t2 \"/scratch/findhao/pta/triton/python/triton/language/standard.py\"\n\t.section\t.debug_abbrev\n\t{\n.b8 1 // Abbreviation Code\n.b8 17 // DW_TAG_compile_unit\n.b8 1 // DW_CHILDREN_yes\n.b8 37 // DW_AT_producer\n.b8 8 // DW_FORM_string\n.b8 19 // DW_AT_language\n.b8 5 // DW_FORM_data2\n.b8 3 // DW_AT_name\n.b8 8 // DW_FORM_string\n.b8 16 // DW_AT_stmt_list\n.b8 6 // DW_FORM_data4\n.b8 27 // DW_AT_comp_dir\n.b8 8 // DW_FORM_string\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 2 // Abbreviation Code\n.b8 46 // DW_TAG_subprogram\n.b8 0 // DW_CHILDREN_no\n.b8 3 // DW_AT_name\n.b8 8 // DW_FORM_string\n.b8 32 // DW_AT_inline\n.b8 11 // DW_FORM_data1\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 3 // Abbreviation Code\n.b8 46 // DW_TAG_subprogram\n.b8 1 // DW_CHILDREN_yes\n.b8 17 // DW_AT_low_pc\n.b8 1 // DW_FORM_addr\n.b8 18 // DW_AT_high_pc\n.b8 1 // DW_FORM_addr\n.b8 49 // DW_AT_abstract_origin\n.b8 19 // DW_FORM_ref4\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 4 // Abbreviation Code\n.b8 29 // DW_TAG_inlined_subroutine\n.b8 0 // DW_CHILDREN_no\n.b8 49 // DW_AT_abstract_origin\n.b8 19 // DW_FORM_ref4\n.b8 17 // DW_AT_low_pc\n.b8 1 // DW_FORM_addr\n.b8 18 // DW_AT_high_pc\n.b8 1 // DW_FORM_addr\n.b8 88 // DW_AT_call_file\n.b8 11 // DW_FORM_data1\n.b8 89 // DW_AT_call_line\n.b8 11 // DW_FORM_data1\n.b8 87 // DW_AT_call_column\n.b8 11 // DW_FORM_data1\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 0 // EOM(3)\n\t}\n\t.section\t.debug_info\n\t{\n.b32 191 // Length of Unit\n.b8 2 // DWARF version number\n.b8 0\n.b32 .debug_abbrev // Offset Into Abbrev. Section\n.b8 8 // Address Size (in bytes)\n.b8 1 // Abbrev [1] 0xb:0xb8 DW_TAG_compile_unit\n.b8 116 // DW_AT_producer\n.b8 114\n.b8 105\n.b8 116\n.b8 111\n.b8 110\n.b8 0\n.b8 2 // DW_AT_language\n.b8 0\n.b8 116 // DW_AT_name\n.b8 101\n.b8 115\n.b8 116\n.b8 95\n.b8 99\n.b8 111\n.b8 109\n.b8 112\n.b8 108\n.b8 101\n.b8 120\n.b8 95\n.b8 107\n.b8 101\n.b8 114\n.b8 110\n.b8 101\n.b8 108\n.b8 115\n.b8 46\n.b8 112\n.b8 121\n.b8 0\n.b32 .debug_line // DW_AT_stmt_list\n.b8 47 // DW_AT_comp_dir\n.b8 115\n.b8 99\n.b8 114\n.b8 97\n.b8 116\n.b8 99\n.b8 104\n.b8 47\n.b8 102\n.b8 105\n.b8 110\n.b8 100\n.b8 104\n.b8 97\n.b8 111\n.b8 47\n.b8 116\n.b8 114\n.b8 105\n.b8 116\n.b8 111\n.b8 110\n.b8 112\n.b8 97\n.b8 114\n.b8 115\n.b8 101\n.b8 47\n.b8 116\n.b8 101\n.b8 115\n.b8 116\n.b8 115\n.b8 0\n.b8 2 // Abbrev [2] 0x54:0x10 DW_TAG_subprogram\n.b8 109 // DW_AT_name\n.b8 97\n.b8 116\n.b8 109\n.b8 117\n.b8 108\n.b8 95\n.b8 107\n.b8 101\n.b8 114\n.b8 110\n.b8 101\n.b8 108\n.b8 0\n.b8 1 // DW_AT_inline\n.b8 3 // Abbrev [3] 0x64:0x5e DW_TAG_subprogram\n.b64 $L__func_begin0 // DW_AT_low_pc\n.b64 $L__func_end0 // DW_AT_high_pc\n.b32 84 // DW_AT_abstract_origin\n.b8 4 // Abbrev [4] 0x79:0x18 DW_TAG_inlined_subroutine\n.b32 84 // DW_AT_abstract_origin\n.b64 $L__tmp1 // DW_AT_low_pc\n.b64 $L__tmp2 // DW_AT_high_pc\n.b8 1 // DW_AT_call_file\n.b8 48 // DW_AT_call_line\n.b8 27 // DW_AT_call_column\n.b8 4 // Abbrev [4] 0x91:0x18 DW_TAG_inlined_subroutine\n.b32 84 // DW_AT_abstract_origin\n.b64 $L__tmp2 // DW_AT_low_pc\n.b64 $L__tmp3 // DW_AT_high_pc\n.b8 1 // DW_AT_call_file\n.b8 49 // DW_AT_call_line\n.b8 27 // DW_AT_call_column\n.b8 4 // Abbrev [4] 0xa9:0x18 DW_TAG_inlined_subroutine\n.b32 84 // DW_AT_abstract_origin\n.b64 $L__tmp4 // DW_AT_low_pc\n.b64 $L__tmp5 // DW_AT_high_pc\n.b8 1 // DW_AT_call_file\n.b8 64 // DW_AT_call_line\n.b8 33 // DW_AT_call_column\n.b8 0 // End Of Children Mark\n.b8 0 // End Of Children Mark\n\t}\n\t.section\t.debug_macinfo\t{\t}\n","matmul_kernel.json":"{\"hash\": \"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7\", \"target\": {\"backend\": \"cuda\", \"arch\": 75, \"warp_size\": 32}, \"num_warps\": 1, \"num_ctas\": 1, \"num_stages\": 1, \"warp_size\": 32, \"maxnreg\": null, \"cluster_dims\": [1, 1, 1], \"ptx_version\": null, \"ptx_options\": null, \"ir_override\": null, \"enable_fp_fusion\": true, \"launch_cooperative_grid\": false, \"launch_pdl\": false, \"supported_fp8_dtypes\": [\"fp8e4b15\", \"fp8e5\"], \"deprecated_fp8_dot_operand_dtypes\": [], \"default_dot_input_precision\": \"tf32\", \"allowed_dot_input_precisions\": [\"tf32\", \"tf32x3\", \"ieee\"], \"max_num_imprecise_acc_default\": 0, \"extern_libs\": [[\"libdevice\", \"/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc\"]], \"debug\": false, \"backend_name\": \"cuda\", \"sanitize_overflow\": true, \"arch\": \"sm75\", \"triton_version\": \"3.4.0\", \"tensordesc_meta\": [], \"shared\": 2048, \"tmem_size\": 0, \"global_scratch_size\": 0, \"global_scratch_align\": 1, \"name\": \"matmul_kernel\"}"},"python_source":{"file_path":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","start_line":29,"end_line":77,"code":"@triton.autotune(\n configs=[\n triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1}, num_stages=1, num_warps=1),\n triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1}, num_stages=1, num_warps=1),\n triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1}, num_stages=1, num_warps=1),\n ],\n key=['M', 'N', 'K'],\n)\n@triton.jit\ndef matmul_kernel(\n a_ptr, b_ptr, c_ptr,\n M, N, K,\n stride_am, stride_ak,\n stride_bk, stride_bn,\n stride_cm, stride_cn,\n BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,\n GROUP_SIZE_M: tl.constexpr,\n):\n pid = tl.program_id(axis=0)\n num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)\n num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)\n num_pid_in_group = GROUP_SIZE_M * num_pid_n\n group_id = pid // num_pid_in_group\n first_pid_m = group_id * GROUP_SIZE_M\n group_size = min(num_pid_m - first_pid_m, GROUP_SIZE_M)\n pid_m = first_pid_m + (pid % group_size)\n pid_n = (pid % num_pid_in_group) // group_size\n\n offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M\n offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N\n offs_k = tl.arange(0, BLOCK_SIZE_K)\n a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)\n b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)\n\n accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)\n for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):\n a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)\n b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)\n accumulator += tl.dot(a, b)\n a_ptrs += BLOCK_SIZE_K * stride_ak\n b_ptrs += BLOCK_SIZE_K * stride_bk\n c = accumulator.to(tl.float16)\n\n offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)\n offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)\n c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]\n c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)\n tl.store(c_ptrs, c, mask=c_mask)\n"},"times":{"ir_initialization":927,"lowering_stages":[],"store_results":0}}} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":null,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":149,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:05.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"compilation","pid":171439,"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":149,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":593,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel = self._do_compile(key, signature, device, constexprs, options, attrs, warmup)"},{"line":773,"name":"_do_compile","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel = self.compile(src, target=target, options=options.__dict__)"},{"line":267,"name":"compile","filename":"/scratch/findhao/pta/triton/python/triton/compiler/compiler.py","loc":"compilation_listener("},{"line":752,"name":"maybe_trace_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton("},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ","payload":{"metadata":{"cache_hit":true,"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32,"env":{},"src_attrs":{"(0,)":[["tt.divisibility",16]],"(1,)":[["tt.divisibility",16]],"(2,)":[["tt.divisibility",16]],"(3,)":[["tt.divisibility",16]],"(4,)":[["tt.divisibility",16]],"(5,)":[["tt.divisibility",16]],"(6,)":[["tt.divisibility",16]],"(8,)":[["tt.divisibility",16]],"(10,)":[["tt.divisibility",16]]},"src_cache_key":"5aec8bef23533ced7a4a2dea17fb314b1446b68a9ca72aa80e32caf75b768172","src_constants":{"(7,)":1,"(9,)":1,"(11,)":1,"(12,)":16,"(13,)":32,"(14,)":16,"(15,)":1}},"file_path":{"matmul_kernel.source":"/home/findhao/.triton/cache/4ZEOBSQEL7XDFD5PWMUHEIW63CH5KD6BS5N2O4XBYOFRJCSI3UTA/matmul_kernel.source","matmul_kernel.ttir":"/home/findhao/.triton/cache/4ZEOBSQEL7XDFD5PWMUHEIW63CH5KD6BS5N2O4XBYOFRJCSI3UTA/matmul_kernel.ttir","matmul_kernel.ttgir":"/home/findhao/.triton/cache/4ZEOBSQEL7XDFD5PWMUHEIW63CH5KD6BS5N2O4XBYOFRJCSI3UTA/matmul_kernel.ttgir","matmul_kernel.llir":"/home/findhao/.triton/cache/4ZEOBSQEL7XDFD5PWMUHEIW63CH5KD6BS5N2O4XBYOFRJCSI3UTA/matmul_kernel.llir","matmul_kernel.ptx":"/home/findhao/.triton/cache/4ZEOBSQEL7XDFD5PWMUHEIW63CH5KD6BS5N2O4XBYOFRJCSI3UTA/matmul_kernel.ptx","matmul_kernel.cubin":"/home/findhao/.triton/cache/4ZEOBSQEL7XDFD5PWMUHEIW63CH5KD6BS5N2O4XBYOFRJCSI3UTA/matmul_kernel.cubin","matmul_kernel.json":"/home/findhao/.triton/cache/4ZEOBSQEL7XDFD5PWMUHEIW63CH5KD6BS5N2O4XBYOFRJCSI3UTA/matmul_kernel.json"},"file_content":{"matmul_kernel.ttir":"#loc = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0)\nmodule {\n tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg3: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg4: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg5: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg6: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg7: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg8: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0)) attributes {noinline = false} {\n %c31_i32 = arith.constant 31 : i32 loc(#loc1)\n %c15_i32 = arith.constant 15 : i32 loc(#loc1)\n %cst = arith.constant dense<0.000000e+00> : tensor<16x32xf16> loc(#loc1)\n %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16> loc(#loc1)\n %c0_i32 = arith.constant 0 : i32 loc(#loc1)\n %cst_1 = arith.constant dense<16> : tensor<16x16xi32> loc(#loc1)\n %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf32> loc(#loc1)\n %c32_i32 = arith.constant 32 : i32 loc(#loc1)\n %c16_i32 = arith.constant 16 : i32 loc(#loc1)\n %c1_i32 = arith.constant 1 : i32 loc(#loc1)\n %0 = tt.get_program_id x : i32 loc(#loc2)\n %1 = arith.addi %arg3, %c15_i32 : i32 loc(#loc56)\n %2 = arith.divsi %1, %c16_i32 : i32 loc(#loc57)\n %3 = arith.addi %arg4, %c31_i32 : i32 loc(#loc58)\n %4 = arith.divsi %3, %c32_i32 : i32 loc(#loc59)\n %5 = arith.divsi %0, %4 : i32 loc(#loc7)\n %6 = arith.subi %2, %5 : i32 loc(#loc8)\n %7 = arith.minsi %6, %c1_i32 : i32 loc(#loc9)\n %8 = arith.remsi %0, %7 : i32 loc(#loc10)\n %9 = arith.addi %5, %8 : i32 loc(#loc11)\n %10 = arith.remsi %0, %4 : i32 loc(#loc12)\n %11 = arith.divsi %10, %7 : i32 loc(#loc13)\n %12 = arith.muli %9, %c16_i32 : i32 loc(#loc14)\n %13 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc15)\n %14 = tt.splat %12 : i32 -> tensor<16xi32> loc(#loc16)\n %15 = arith.addi %14, %13 : tensor<16xi32> loc(#loc16)\n %16 = tt.splat %arg3 : i32 -> tensor<16xi32> loc(#loc17)\n %17 = arith.remsi %15, %16 : tensor<16xi32> loc(#loc17)\n %18 = arith.muli %11, %c32_i32 : i32 loc(#loc18)\n %19 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc19)\n %20 = tt.splat %18 : i32 -> tensor<32xi32> loc(#loc20)\n %21 = arith.addi %20, %19 : tensor<32xi32> loc(#loc20)\n %22 = tt.splat %arg4 : i32 -> tensor<32xi32> loc(#loc21)\n %23 = arith.remsi %21, %22 : tensor<32xi32> loc(#loc21)\n %24 = tt.expand_dims %17 {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc22)\n %25 = tt.splat %arg6 : i32 -> tensor<16x1xi32> loc(#loc23)\n %26 = arith.muli %24, %25 : tensor<16x1xi32> loc(#loc23)\n %27 = tt.expand_dims %13 {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc24)\n %28 = tt.broadcast %26 : tensor<16x1xi32> -> tensor<16x16xi32> loc(#loc25)\n %29 = tt.broadcast %27 : tensor<1x16xi32> -> tensor<16x16xi32> loc(#loc25)\n %30 = arith.addi %28, %29 : tensor<16x16xi32> loc(#loc25)\n %31 = tt.splat %arg0 : !tt.ptr -> tensor<16x16x!tt.ptr> loc(#loc26)\n %32 = tt.addptr %31, %30 : tensor<16x16x!tt.ptr>, tensor<16x16xi32> loc(#loc26)\n %33 = tt.expand_dims %13 {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc27)\n %34 = tt.splat %arg7 : i32 -> tensor<16x1xi32> loc(#loc28)\n %35 = arith.muli %33, %34 : tensor<16x1xi32> loc(#loc28)\n %36 = tt.expand_dims %23 {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc29)\n %37 = tt.broadcast %35 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc30)\n %38 = tt.broadcast %36 : tensor<1x32xi32> -> tensor<16x32xi32> loc(#loc30)\n %39 = arith.addi %37, %38 : tensor<16x32xi32> loc(#loc30)\n %40 = tt.splat %arg1 : !tt.ptr -> tensor<16x32x!tt.ptr> loc(#loc31)\n %41 = tt.addptr %40, %39 : tensor<16x32x!tt.ptr>, tensor<16x32xi32> loc(#loc31)\n %42 = arith.addi %arg5, %c15_i32 : i32 loc(#loc60)\n %43 = arith.divsi %42, %c16_i32 : i32 loc(#loc61)\n %44:3 = scf.for %arg9 = %c0_i32 to %43 step %c1_i32 iter_args(%arg10 = %32, %arg11 = %41, %arg12 = %cst_2) -> (tensor<16x16x!tt.ptr>, tensor<16x32x!tt.ptr>, tensor<16x32xf32>) : i32 {\n %62 = arith.muli %arg9, %c16_i32 : i32 loc(#loc34)\n %63 = arith.subi %arg5, %62 : i32 loc(#loc35)\n %64 = tt.splat %63 : i32 -> tensor<1x16xi32> loc(#loc36)\n %65 = arith.cmpi slt, %27, %64 : tensor<1x16xi32> loc(#loc36)\n %66 = tt.broadcast %65 : tensor<1x16xi1> -> tensor<16x16xi1> loc(#loc37)\n %67 = tt.load %arg10, %66, %cst_0 : tensor<16x16x!tt.ptr> loc(#loc37)\n %68 = tt.splat %63 : i32 -> tensor<16x1xi32> loc(#loc38)\n %69 = arith.cmpi slt, %33, %68 : tensor<16x1xi32> loc(#loc38)\n %70 = tt.broadcast %69 : tensor<16x1xi1> -> tensor<16x32xi1> loc(#loc39)\n %71 = tt.load %arg11, %70, %cst : tensor<16x32x!tt.ptr> loc(#loc39)\n %72 = tt.dot %67, %71, %arg12, inputPrecision = tf32 : tensor<16x16xf16> * tensor<16x32xf16> -> tensor<16x32xf32> loc(#loc40)\n %73 = tt.addptr %arg10, %cst_1 : tensor<16x16x!tt.ptr>, tensor<16x16xi32> loc(#loc41)\n %74 = arith.muli %arg7, %c16_i32 : i32 loc(#loc42)\n %75 = tt.splat %74 : i32 -> tensor<16x32xi32> loc(#loc43)\n %76 = tt.addptr %arg11, %75 : tensor<16x32x!tt.ptr>, tensor<16x32xi32> loc(#loc43)\n scf.yield %73, %76, %72 : tensor<16x16x!tt.ptr>, tensor<16x32x!tt.ptr>, tensor<16x32xf32> loc(#loc44)\n } loc(#loc33)\n %45 = arith.truncf %44#2 : tensor<16x32xf32> to tensor<16x32xf16> loc(#loc45)\n %46 = tt.expand_dims %15 {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc46)\n %47 = tt.splat %arg8 : i32 -> tensor<16x1xi32> loc(#loc47)\n %48 = arith.muli %47, %46 : tensor<16x1xi32> loc(#loc47)\n %49 = tt.splat %arg2 : !tt.ptr -> tensor<16x1x!tt.ptr> loc(#loc48)\n %50 = tt.addptr %49, %48 : tensor<16x1x!tt.ptr>, tensor<16x1xi32> loc(#loc48)\n %51 = tt.expand_dims %21 {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc49)\n %52 = tt.broadcast %50 : tensor<16x1x!tt.ptr> -> tensor<16x32x!tt.ptr> loc(#loc50)\n %53 = tt.broadcast %51 : tensor<1x32xi32> -> tensor<16x32xi32> loc(#loc50)\n %54 = tt.addptr %52, %53 : tensor<16x32x!tt.ptr>, tensor<16x32xi32> loc(#loc50)\n %55 = tt.splat %arg3 : i32 -> tensor<16x1xi32> loc(#loc51)\n %56 = arith.cmpi slt, %46, %55 : tensor<16x1xi32> loc(#loc51)\n %57 = tt.splat %arg4 : i32 -> tensor<1x32xi32> loc(#loc52)\n %58 = arith.cmpi slt, %51, %57 : tensor<1x32xi32> loc(#loc52)\n %59 = tt.broadcast %56 : tensor<16x1xi1> -> tensor<16x32xi1> loc(#loc53)\n %60 = tt.broadcast %58 : tensor<1x32xi1> -> tensor<16x32xi1> loc(#loc53)\n %61 = arith.andi %59, %60 : tensor<16x32xi1> loc(#loc53)\n tt.store %54, %45, %61 : tensor<16x32x!tt.ptr> loc(#loc54)\n tt.return loc(#loc55)\n } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":47:24)\n#loc3 = loc(\"/scratch/findhao/pta/triton/python/triton/language/standard.py\":40:22)\n#loc4 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":48:27)\n#loc5 = loc(\"/scratch/findhao/pta/triton/python/triton/language/standard.py\":40:28)\n#loc6 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":49:27)\n#loc7 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":51:22)\n#loc8 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":53:33)\n#loc9 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":53:46)\n#loc10 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":54:33)\n#loc11 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":54:27)\n#loc12 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":55:19)\n#loc13 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":55:40)\n#loc14 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:23)\n#loc15 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:51)\n#loc16 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:38)\n#loc17 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:68)\n#loc18 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:23)\n#loc19 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:51)\n#loc20 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:38)\n#loc21 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:68)\n#loc22 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:30)\n#loc23 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:41)\n#loc24 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:60)\n#loc25 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:53)\n#loc26 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:22)\n#loc27 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:29)\n#loc28 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:40)\n#loc29 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:60)\n#loc30 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:52)\n#loc31 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:22)\n#loc32 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":64:33)\n#loc33 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":64:22)\n#loc34 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:59)\n#loc35 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:55)\n#loc36 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:51)\n#loc37 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:20)\n#loc38 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":66:51)\n#loc39 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":66:20)\n#loc40 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":67:33)\n#loc41 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":68:18)\n#loc42 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:33)\n#loc43 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:18)\n#loc44 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:8)\n#loc45 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":70:23)\n#loc46 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:41)\n#loc47 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:33)\n#loc48 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:21)\n#loc49 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:72)\n#loc50 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:52)\n#loc51 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:33)\n#loc52 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:58)\n#loc53 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:39)\n#loc54 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":76:21)\n#loc55 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":76:4)\n#loc56 = loc(callsite(#loc3 at #loc4))\n#loc57 = loc(callsite(#loc5 at #loc4))\n#loc58 = loc(callsite(#loc3 at #loc6))\n#loc59 = loc(callsite(#loc5 at #loc6))\n#loc60 = loc(callsite(#loc3 at #loc32))\n#loc61 = loc(callsite(#loc5 at #loc32))\n","matmul_kernel.ttgir":"#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [16, 2], warpsPerCTA = [1, 1], order = [1, 0]}>\n#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0]}>\n#blocked2 = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [4, 8], warpsPerCTA = [1, 1], order = [1, 0]}>\n#loc = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0)\n#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>\n#smem = #ttg.shared_memory\nmodule attributes {\"ttg.num-ctas\" = 1 : i32, \"ttg.num-warps\" = 1 : i32, ttg.target = \"cuda:75\", \"ttg.threads-per-warp\" = 32 : i32} {\n tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg3: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg4: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg5: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg6: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg7: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0), %arg8: i32 {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":38:0)) attributes {noinline = false} {\n %cst = arith.constant dense<16> : tensor<16x16xi32, #blocked> loc(#loc1)\n %c0_i32 = arith.constant 0 : i32 loc(#loc1)\n %c15_i32 = arith.constant 15 : i32 loc(#loc1)\n %c31_i32 = arith.constant 31 : i32 loc(#loc1)\n %c32_i32 = arith.constant 32 : i32 loc(#loc1)\n %c16_i32 = arith.constant 16 : i32 loc(#loc1)\n %c1_i32 = arith.constant 1 : i32 loc(#loc1)\n %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #blocked> loc(#loc1)\n %cst_1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #blocked1> loc(#loc1)\n %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf32, #blocked2> loc(#loc1)\n %0 = tt.get_program_id x : i32 loc(#loc2)\n %1 = arith.addi %arg3, %c15_i32 : i32 loc(#loc56)\n %2 = arith.divsi %1, %c16_i32 : i32 loc(#loc57)\n %3 = arith.addi %arg4, %c31_i32 : i32 loc(#loc58)\n %4 = arith.divsi %3, %c32_i32 : i32 loc(#loc59)\n %5 = arith.divsi %0, %4 : i32 loc(#loc7)\n %6 = arith.subi %2, %5 : i32 loc(#loc8)\n %7 = arith.minsi %6, %c1_i32 : i32 loc(#loc9)\n %8 = arith.remsi %0, %7 : i32 loc(#loc10)\n %9 = arith.addi %5, %8 : i32 loc(#loc11)\n %10 = arith.remsi %0, %4 : i32 loc(#loc12)\n %11 = arith.divsi %10, %7 : i32 loc(#loc13)\n %12 = arith.muli %9, %c16_i32 : i32 loc(#loc14)\n %13 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc15)\n %14 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc15)\n %15 = tt.splat %12 : i32 -> tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc16)\n %16 = tt.splat %12 : i32 -> tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc16)\n %17 = arith.addi %15, %13 : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc16)\n %18 = arith.addi %16, %14 : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc16)\n %19 = tt.splat %arg3 : i32 -> tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc17)\n %20 = arith.remsi %17, %19 : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc17)\n %21 = arith.muli %11, %c32_i32 : i32 loc(#loc18)\n %22 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc19)\n %23 = tt.splat %21 : i32 -> tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc20)\n %24 = arith.addi %23, %22 : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc20)\n %25 = tt.splat %arg4 : i32 -> tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc21)\n %26 = arith.remsi %24, %25 : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc21)\n %27 = tt.expand_dims %20 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> loc(#loc22)\n %28 = tt.splat %arg6 : i32 -> tensor<16x1xi32, #blocked> loc(#loc23)\n %29 = arith.muli %27, %28 : tensor<16x1xi32, #blocked> loc(#loc23)\n %30 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc24)\n %31 = tt.expand_dims %30 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc24)\n %32 = tt.broadcast %29 : tensor<16x1xi32, #blocked> -> tensor<16x16xi32, #blocked> loc(#loc25)\n %33 = tt.broadcast %31 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked> loc(#loc25)\n %34 = arith.addi %32, %33 : tensor<16x16xi32, #blocked> loc(#loc25)\n %35 = tt.splat %arg0 : !tt.ptr -> tensor<16x16x!tt.ptr, #blocked> loc(#loc26)\n %36 = tt.addptr %35, %34 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> loc(#loc26)\n %37 = tt.expand_dims %14 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xi32, #blocked1> loc(#loc27)\n %38 = tt.splat %arg7 : i32 -> tensor<16x1xi32, #blocked1> loc(#loc28)\n %39 = arith.muli %37, %38 : tensor<16x1xi32, #blocked1> loc(#loc28)\n %40 = tt.expand_dims %26 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc29)\n %41 = tt.broadcast %39 : tensor<16x1xi32, #blocked1> -> tensor<16x32xi32, #blocked1> loc(#loc30)\n %42 = tt.broadcast %40 : tensor<1x32xi32, #blocked1> -> tensor<16x32xi32, #blocked1> loc(#loc30)\n %43 = arith.addi %41, %42 : tensor<16x32xi32, #blocked1> loc(#loc30)\n %44 = tt.splat %arg1 : !tt.ptr -> tensor<16x32x!tt.ptr, #blocked1> loc(#loc31)\n %45 = tt.addptr %44, %43 : tensor<16x32x!tt.ptr, #blocked1>, tensor<16x32xi32, #blocked1> loc(#loc31)\n %46 = arith.addi %arg5, %c15_i32 : i32 loc(#loc60)\n %47 = arith.divsi %46, %c16_i32 : i32 loc(#loc61)\n %48 = arith.muli %arg7, %c16_i32 : i32 loc(#loc33)\n %49 = tt.splat %48 : i32 -> tensor<16x32xi32, #blocked1> loc(#loc34)\n %50:3 = scf.for %arg9 = %c0_i32 to %47 step %c1_i32 iter_args(%arg10 = %cst_2, %arg11 = %36, %arg12 = %45) -> (tensor<16x32xf32, #blocked2>, tensor<16x16x!tt.ptr, #blocked>, tensor<16x32x!tt.ptr, #blocked1>) : i32 {\n %69 = arith.muli %arg9, %c16_i32 : i32 loc(#loc36)\n %70 = arith.subi %arg5, %69 : i32 loc(#loc37)\n %71 = tt.splat %70 : i32 -> tensor<1x16xi32, #blocked> loc(#loc38)\n %72 = arith.cmpi slt, %31, %71 : tensor<1x16xi32, #blocked> loc(#loc38)\n %73 = tt.broadcast %72 : tensor<1x16xi1, #blocked> -> tensor<16x16xi1, #blocked> loc(#loc39)\n %74 = tt.load %arg11, %73, %cst_0 : tensor<16x16x!tt.ptr, #blocked> loc(#loc39)\n %75 = tt.splat %70 : i32 -> tensor<16x1xi32, #blocked1> loc(#loc40)\n %76 = arith.cmpi slt, %37, %75 : tensor<16x1xi32, #blocked1> loc(#loc40)\n %77 = tt.broadcast %76 : tensor<16x1xi1, #blocked1> -> tensor<16x32xi1, #blocked1> loc(#loc41)\n %78 = tt.load %arg12, %77, %cst_1 : tensor<16x32x!tt.ptr, #blocked1> loc(#loc41)\n %79 = arith.extf %74 : tensor<16x16xf16, #blocked> to tensor<16x16xf32, #blocked> loc(#loc42)\n %80 = ttg.local_alloc %79 : (tensor<16x16xf32, #blocked>) -> !ttg.memdesc<16x16xf32, #shared, #smem> loc(#loc42)\n %81 = ttg.local_load %80 : !ttg.memdesc<16x16xf32, #shared, #smem> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> loc(#loc42)\n %82 = arith.extf %78 : tensor<16x32xf16, #blocked1> to tensor<16x32xf32, #blocked1> loc(#loc42)\n %83 = ttg.local_alloc %82 : (tensor<16x32xf32, #blocked1>) -> !ttg.memdesc<16x32xf32, #shared, #smem> loc(#loc42)\n %84 = ttg.local_load %83 : !ttg.memdesc<16x32xf32, #shared, #smem> -> tensor<16x32xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked2}>> loc(#loc42)\n %85 = tt.dot %81, %84, %arg10, inputPrecision = tf32 : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> * tensor<16x32xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked2}>> -> tensor<16x32xf32, #blocked2> loc(#loc42)\n %86 = tt.addptr %arg11, %cst : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> loc(#loc43)\n %87 = tt.addptr %arg12, %49 : tensor<16x32x!tt.ptr, #blocked1>, tensor<16x32xi32, #blocked1> loc(#loc34)\n scf.yield %85, %86, %87 : tensor<16x32xf32, #blocked2>, tensor<16x16x!tt.ptr, #blocked>, tensor<16x32x!tt.ptr, #blocked1> loc(#loc44)\n } loc(#loc35)\n %51 = arith.truncf %50#0 : tensor<16x32xf32, #blocked2> to tensor<16x32xf16, #blocked2> loc(#loc45)\n %52 = tt.expand_dims %18 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xi32, #blocked1> loc(#loc46)\n %53 = tt.splat %arg8 : i32 -> tensor<16x1xi32, #blocked1> loc(#loc47)\n %54 = arith.muli %53, %52 : tensor<16x1xi32, #blocked1> loc(#loc47)\n %55 = tt.splat %arg2 : !tt.ptr -> tensor<16x1x!tt.ptr, #blocked1> loc(#loc48)\n %56 = tt.addptr %55, %54 : tensor<16x1x!tt.ptr, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc48)\n %57 = tt.expand_dims %24 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc49)\n %58 = tt.broadcast %56 : tensor<16x1x!tt.ptr, #blocked1> -> tensor<16x32x!tt.ptr, #blocked1> loc(#loc50)\n %59 = tt.broadcast %57 : tensor<1x32xi32, #blocked1> -> tensor<16x32xi32, #blocked1> loc(#loc50)\n %60 = tt.addptr %58, %59 : tensor<16x32x!tt.ptr, #blocked1>, tensor<16x32xi32, #blocked1> loc(#loc50)\n %61 = tt.splat %arg3 : i32 -> tensor<16x1xi32, #blocked1> loc(#loc51)\n %62 = arith.cmpi slt, %52, %61 : tensor<16x1xi32, #blocked1> loc(#loc51)\n %63 = tt.splat %arg4 : i32 -> tensor<1x32xi32, #blocked1> loc(#loc52)\n %64 = arith.cmpi slt, %57, %63 : tensor<1x32xi32, #blocked1> loc(#loc52)\n %65 = tt.broadcast %62 : tensor<16x1xi1, #blocked1> -> tensor<16x32xi1, #blocked1> loc(#loc53)\n %66 = tt.broadcast %64 : tensor<1x32xi1, #blocked1> -> tensor<16x32xi1, #blocked1> loc(#loc53)\n %67 = arith.andi %65, %66 : tensor<16x32xi1, #blocked1> loc(#loc53)\n %68 = ttg.convert_layout %51 : tensor<16x32xf16, #blocked2> -> tensor<16x32xf16, #blocked1> loc(#loc54)\n tt.store %60, %68, %67 : tensor<16x32x!tt.ptr, #blocked1> loc(#loc54)\n tt.return loc(#loc55)\n } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":47:24)\n#loc3 = loc(\"/scratch/findhao/pta/triton/python/triton/language/standard.py\":40:22)\n#loc4 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":48:27)\n#loc5 = loc(\"/scratch/findhao/pta/triton/python/triton/language/standard.py\":40:28)\n#loc6 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":49:27)\n#loc7 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":51:22)\n#loc8 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":53:33)\n#loc9 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":53:46)\n#loc10 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":54:33)\n#loc11 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":54:27)\n#loc12 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":55:19)\n#loc13 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":55:40)\n#loc14 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:23)\n#loc15 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:51)\n#loc16 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:38)\n#loc17 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":57:68)\n#loc18 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:23)\n#loc19 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:51)\n#loc20 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:38)\n#loc21 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":58:68)\n#loc22 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:30)\n#loc23 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:41)\n#loc24 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:60)\n#loc25 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:53)\n#loc26 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":60:22)\n#loc27 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:29)\n#loc28 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:40)\n#loc29 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:60)\n#loc30 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:52)\n#loc31 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":61:22)\n#loc32 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":64:33)\n#loc33 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:33)\n#loc34 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:18)\n#loc35 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":64:22)\n#loc36 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:59)\n#loc37 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:55)\n#loc38 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:51)\n#loc39 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":65:20)\n#loc40 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":66:51)\n#loc41 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":66:20)\n#loc42 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":67:33)\n#loc43 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":68:18)\n#loc44 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":69:8)\n#loc45 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":70:23)\n#loc46 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:41)\n#loc47 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:33)\n#loc48 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:21)\n#loc49 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:72)\n#loc50 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":74:52)\n#loc51 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:33)\n#loc52 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:58)\n#loc53 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":75:39)\n#loc54 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":76:21)\n#loc55 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":76:4)\n#loc56 = loc(callsite(#loc3 at #loc4))\n#loc57 = loc(callsite(#loc5 at #loc4))\n#loc58 = loc(callsite(#loc3 at #loc6))\n#loc59 = loc(callsite(#loc5 at #loc6))\n#loc60 = loc(callsite(#loc3 at #loc32))\n#loc61 = loc(callsite(#loc5 at #loc32))\n","matmul_kernel.llir":"; ModuleID = 'LLVMDialectModule'\nsource_filename = \"LLVMDialectModule\"\ntarget datalayout = \"e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64\"\n\n@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16\n\ndefine ptx_kernel void @matmul_kernel(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9) local_unnamed_addr #0 !dbg !5 {\n %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8\n %12 = add i32 %3, 15, !dbg !9\n %13 = sdiv i32 %12, 16, !dbg !13\n %14 = add i32 %4, 31, !dbg !14\n %15 = sdiv i32 %14, 32, !dbg !16\n %.frozen = freeze i32 %15, !dbg !17\n %16 = sdiv i32 %11, %.frozen, !dbg !17\n %17 = sub i32 %13, %16, !dbg !18\n %18 = tail call i32 @llvm.smin.i32(i32 %17, i32 1), !dbg !19\n %19 = srem i32 %11, %18, !dbg !20\n %20 = add i32 %19, %16, !dbg !21\n %21 = mul i32 %16, %.frozen, !dbg !22\n %.decomposed = sub i32 %11, %21, !dbg !22\n %22 = sdiv i32 %.decomposed, %18, !dbg !23\n %23 = shl i32 %20, 4, !dbg !24\n %24 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !25\n %25 = lshr i32 %24, 2, !dbg !25\n %26 = and i32 %25, 7, !dbg !25\n %27 = or disjoint i32 %26, 8, !dbg !25\n %28 = shl nsw i32 %22, 5, !dbg !26\n %29 = and i32 %24, 3, !dbg !27\n %30 = shl nuw nsw i32 %29, 3, !dbg !27\n %31 = or disjoint i32 %28, %30, !dbg !28\n %32 = shl nuw nsw i32 %24, 3, !dbg !29\n %33 = and i32 %32, 8, !dbg !29\n %34 = add i32 %5, 15, !dbg !30\n %35 = sdiv i32 %34, 16, !dbg !32\n %36 = icmp sgt i32 %34, 15, !dbg !33\n br i1 %36, label %.lr.ph, label %.._crit_edge_crit_edge, !dbg !33\n\n.._crit_edge_crit_edge: ; preds = %10\n %.pre = shl nuw nsw i32 %24, 4, !dbg !34\n br label %._crit_edge, !dbg !33\n\n.lr.ph: ; preds = %10\n %37 = shl i32 %7, 4, !dbg !35\n %38 = srem i32 %31, %4, !dbg !36\n %39 = mul i32 %7, %27, !dbg !37\n %40 = add i32 %38, %39, !dbg !38\n %41 = sext i32 %40 to i64, !dbg !39\n %42 = getelementptr half, ptr addrspace(1) %1, i64 %41, !dbg !39\n %43 = mul i32 %7, %26, !dbg !37\n %44 = add i32 %38, %43, !dbg !38\n %45 = sext i32 %44 to i64, !dbg !39\n %46 = getelementptr half, ptr addrspace(1) %1, i64 %45, !dbg !39\n %47 = lshr i32 %24, 1, !dbg !25\n %48 = and i32 %47, 15, !dbg !25\n %49 = or disjoint i32 %23, %48, !dbg !40\n %50 = srem i32 %49, %3, !dbg !41\n %51 = mul i32 %50, %6, !dbg !42\n %52 = add i32 %51, %33, !dbg !43\n %53 = sext i32 %52 to i64, !dbg !44\n %54 = getelementptr half, ptr addrspace(1) %0, i64 %53, !dbg !44\n %55 = shl nuw nsw i32 %24, 5\n %56 = and i32 %55, 992\n %57 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %56\n %58 = getelementptr inbounds nuw i8, ptr addrspace(3) %57, i32 16\n %59 = and i32 %55, 768\n %60 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %59\n %61 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 16\n %62 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 64\n %63 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 80\n %64 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 128\n %65 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 144\n %66 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 192\n %67 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 208\n %68 = getelementptr inbounds nuw i8, ptr addrspace(3) %57, i32 1024\n %69 = getelementptr inbounds nuw i8, ptr addrspace(3) %57, i32 1040\n %70 = shl nuw nsw i32 %24, 4\n %71 = and i32 %70, 112\n %72 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %71\n %73 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 128\n %74 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 256\n %75 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 384\n %76 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 512\n %77 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 640\n %78 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 768\n %79 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 896\n %80 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 1024\n %81 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 1152\n %82 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 1280\n %83 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 1408\n %84 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 1536\n %85 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 1664\n %86 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 1792\n %87 = getelementptr inbounds nuw i8, ptr addrspace(3) %72, i32 1920\n %88 = sext i32 %37 to i64\n %89 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 24\n %90 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 40\n %91 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 56\n %92 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 88\n %93 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 104\n %94 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 120\n %95 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 152\n %96 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 168\n %97 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 184\n %98 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 216\n %99 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 232\n %100 = getelementptr inbounds nuw i8, ptr addrspace(3) %60, i32 248\n br label %101, !dbg !33\n\n101: ; preds = %.lr.ph, %101\n %.pn31206 = phi ptr addrspace(1) [ %42, %.lr.ph ], [ %278, %101 ]\n %.pn47205 = phi ptr addrspace(1) [ %46, %.lr.ph ], [ %277, %101 ]\n %.pn15204 = phi ptr addrspace(1) [ %54, %.lr.ph ], [ %276, %101 ]\n %102 = phi i32 [ 0, %.lr.ph ], [ %279, %101 ]\n %103 = phi <16 x float> [ zeroinitializer, %.lr.ph ], [ %275, %101 ]\n %104 = shufflevector <16 x float> %103, <16 x float> poison, <16 x i32> \n %105 = shl i32 %102, 4, !dbg !45\n %106 = sub i32 %5, %105, !dbg !46\n %107 = icmp slt i32 %33, %106, !dbg !47\n %108 = tail call { i32, i32, i32, i32 } asm sideeffect \"mov.u32 $0, $4;\\0A\\09mov.u32 $1, $5;\\0A\\09mov.u32 $2, $6;\\0A\\09mov.u32 $3, $7;\\0A\\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];\", \"=r,=r,=r,=r,r,r,r,r,l,b\"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %.pn15204, i1 %107) #4, !dbg !48\n %109 = extractvalue { i32, i32, i32, i32 } %108, 0, !dbg !48\n %110 = bitcast i32 %109 to <2 x half>, !dbg !48\n %111 = extractvalue { i32, i32, i32, i32 } %108, 1, !dbg !48\n %112 = bitcast i32 %111 to <2 x half>, !dbg !48\n %113 = extractvalue { i32, i32, i32, i32 } %108, 2, !dbg !48\n %114 = bitcast i32 %113 to <2 x half>, !dbg !48\n %115 = extractvalue { i32, i32, i32, i32 } %108, 3, !dbg !48\n %116 = bitcast i32 %115 to <2 x half>, !dbg !48\n %117 = icmp slt i32 %26, %106, !dbg !49\n %118 = icmp slt i32 %27, %106, !dbg !49\n %119 = tail call { i32, i32, i32, i32 } asm sideeffect \"mov.u32 $0, $4;\\0A\\09mov.u32 $1, $5;\\0A\\09mov.u32 $2, $6;\\0A\\09mov.u32 $3, $7;\\0A\\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];\", \"=r,=r,=r,=r,r,r,r,r,l,b\"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %.pn47205, i1 %117) #4, !dbg !50\n %120 = extractvalue { i32, i32, i32, i32 } %119, 0, !dbg !50\n %121 = bitcast i32 %120 to <2 x half>, !dbg !50\n %122 = extractvalue { i32, i32, i32, i32 } %119, 1, !dbg !50\n %123 = bitcast i32 %122 to <2 x half>, !dbg !50\n %124 = extractvalue { i32, i32, i32, i32 } %119, 2, !dbg !50\n %125 = bitcast i32 %124 to <2 x half>, !dbg !50\n %126 = extractvalue { i32, i32, i32, i32 } %119, 3, !dbg !50\n %127 = bitcast i32 %126 to <2 x half>, !dbg !50\n %128 = tail call { i32, i32, i32, i32 } asm sideeffect \"mov.u32 $0, $4;\\0A\\09mov.u32 $1, $5;\\0A\\09mov.u32 $2, $6;\\0A\\09mov.u32 $3, $7;\\0A\\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];\", \"=r,=r,=r,=r,r,r,r,r,l,b\"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %.pn31206, i1 %118) #4, !dbg !50\n %129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !50\n %130 = bitcast i32 %129 to <2 x half>, !dbg !50\n %131 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !50\n %132 = bitcast i32 %131 to <2 x half>, !dbg !50\n %133 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !50\n %134 = bitcast i32 %133 to <2 x half>, !dbg !50\n %135 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !50\n %136 = bitcast i32 %135 to <2 x half>, !dbg !50\n %137 = shufflevector <2 x half> %110, <2 x half> %112, <4 x i32> , !dbg !51\n %138 = fpext <4 x half> %137 to <4 x float>, !dbg !51\n %139 = shufflevector <2 x half> %114, <2 x half> %116, <4 x i32> , !dbg !51\n %140 = fpext <4 x half> %139 to <4 x float>, !dbg !51\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51\n store <4 x float> %138, ptr addrspace(3) %57, align 16, !dbg !51\n store <4 x float> %140, ptr addrspace(3) %58, align 16, !dbg !51\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51\n %141 = shufflevector <2 x half> %121, <2 x half> %123, <4 x i32> , !dbg !51\n %142 = fpext <4 x half> %141 to <4 x float>, !dbg !51\n %143 = shufflevector <2 x half> %125, <2 x half> %127, <4 x i32> , !dbg !51\n %144 = fpext <4 x half> %143 to <4 x float>, !dbg !51\n %145 = shufflevector <2 x half> %130, <2 x half> %132, <4 x i32> , !dbg !51\n %146 = fpext <4 x half> %145 to <4 x float>, !dbg !51\n %147 = shufflevector <2 x half> %134, <2 x half> %136, <4 x i32> , !dbg !51\n %148 = fpext <4 x half> %147 to <4 x float>, !dbg !51\n %149 = load <4 x float>, ptr addrspace(3) %89, align 8, !dbg !51\n %150 = load <4 x float>, ptr addrspace(3) %90, align 8, !dbg !51\n %151 = load <2 x float>, ptr addrspace(3) %91, align 8, !dbg !51\n %152 = load <4 x float>, ptr addrspace(3) %92, align 8, !dbg !51\n %153 = load <4 x float>, ptr addrspace(3) %93, align 8, !dbg !51\n %154 = load <2 x float>, ptr addrspace(3) %94, align 8, !dbg !51\n %155 = load <4 x float>, ptr addrspace(3) %95, align 8, !dbg !51\n %156 = load <4 x float>, ptr addrspace(3) %96, align 8, !dbg !51\n %157 = load <2 x float>, ptr addrspace(3) %97, align 8, !dbg !51\n %158 = load <4 x float>, ptr addrspace(3) %98, align 8, !dbg !51\n %159 = load <4 x float>, ptr addrspace(3) %99, align 8, !dbg !51\n %160 = load <2 x float>, ptr addrspace(3) %100, align 8, !dbg !51\n %161 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !51\n %162 = load <2 x float>, ptr addrspace(3) %61, align 16, !dbg !51\n %163 = load <4 x float>, ptr addrspace(3) %62, align 16, !dbg !51\n %164 = load <2 x float>, ptr addrspace(3) %63, align 16, !dbg !51\n %165 = load <4 x float>, ptr addrspace(3) %64, align 16, !dbg !51\n %166 = load <2 x float>, ptr addrspace(3) %65, align 16, !dbg !51\n %167 = load <4 x float>, ptr addrspace(3) %66, align 16, !dbg !51\n %168 = load <2 x float>, ptr addrspace(3) %67, align 16, !dbg !51\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51\n store <4 x float> %142, ptr addrspace(3) %57, align 16, !dbg !51\n store <4 x float> %144, ptr addrspace(3) %58, align 16, !dbg !51\n store <4 x float> %146, ptr addrspace(3) %68, align 16, !dbg !51\n store <4 x float> %148, ptr addrspace(3) %69, align 16, !dbg !51\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51\n %169 = load <4 x float>, ptr addrspace(3) %72, align 16, !dbg !51\n %170 = shufflevector <4 x float> %169, <4 x float> poison, <16 x i32> , !dbg !51\n %171 = load <4 x float>, ptr addrspace(3) %73, align 16, !dbg !51\n %172 = shufflevector <4 x float> %171, <4 x float> poison, <16 x i32> , !dbg !51\n %173 = load <4 x float>, ptr addrspace(3) %74, align 16, !dbg !51\n %174 = shufflevector <4 x float> %173, <4 x float> poison, <16 x i32> , !dbg !51\n %175 = load <4 x float>, ptr addrspace(3) %75, align 16, !dbg !51\n %176 = shufflevector <4 x float> %175, <4 x float> poison, <16 x i32> , !dbg !51\n %177 = load <4 x float>, ptr addrspace(3) %76, align 16, !dbg !51\n %178 = shufflevector <4 x float> %177, <4 x float> poison, <16 x i32> , !dbg !51\n %179 = load <4 x float>, ptr addrspace(3) %77, align 16, !dbg !51\n %180 = shufflevector <4 x float> %179, <4 x float> poison, <16 x i32> , !dbg !51\n %181 = load <4 x float>, ptr addrspace(3) %78, align 16, !dbg !51\n %182 = shufflevector <4 x float> %181, <4 x float> poison, <16 x i32> , !dbg !51\n %183 = load <4 x float>, ptr addrspace(3) %79, align 16, !dbg !51\n %184 = shufflevector <4 x float> %183, <4 x float> poison, <16 x i32> , !dbg !51\n %185 = load <4 x float>, ptr addrspace(3) %80, align 16, !dbg !51\n %186 = shufflevector <4 x float> %185, <4 x float> poison, <16 x i32> , !dbg !51\n %187 = load <4 x float>, ptr addrspace(3) %81, align 16, !dbg !51\n %188 = shufflevector <4 x float> %187, <4 x float> poison, <16 x i32> , !dbg !51\n %189 = load <4 x float>, ptr addrspace(3) %82, align 16, !dbg !51\n %190 = shufflevector <4 x float> %189, <4 x float> poison, <16 x i32> , !dbg !51\n %191 = load <4 x float>, ptr addrspace(3) %83, align 16, !dbg !51\n %192 = shufflevector <4 x float> %191, <4 x float> poison, <16 x i32> , !dbg !51\n %193 = load <4 x float>, ptr addrspace(3) %84, align 16, !dbg !51\n %194 = shufflevector <4 x float> %193, <4 x float> poison, <16 x i32> , !dbg !51\n %195 = load <4 x float>, ptr addrspace(3) %85, align 16, !dbg !51\n %196 = shufflevector <4 x float> %195, <4 x float> poison, <16 x i32> , !dbg !51\n %197 = load <4 x float>, ptr addrspace(3) %86, align 16, !dbg !51\n %198 = shufflevector <4 x float> %197, <4 x float> poison, <16 x i32> , !dbg !51\n %199 = load <4 x float>, ptr addrspace(3) %87, align 16, !dbg !51\n %200 = shufflevector <4 x float> %199, <4 x float> poison, <16 x i32> , !dbg !51\n %201 = shufflevector <4 x float> %161, <4 x float> %163, <16 x i32> , !dbg !51\n %202 = shufflevector <4 x float> %165, <4 x float> poison, <16 x i32> , !dbg !51\n %203 = shufflevector <16 x float> %201, <16 x float> %202, <16 x i32> , !dbg !51\n %204 = shufflevector <4 x float> %167, <4 x float> poison, <16 x i32> , !dbg !51\n %205 = shufflevector <16 x float> %203, <16 x float> %204, <16 x i32> , !dbg !51\n %206 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %205, <16 x float> %170, <16 x float> %104), !dbg !51\n %207 = shufflevector <4 x float> %161, <4 x float> %163, <16 x i32> , !dbg !51\n %208 = shufflevector <16 x float> %207, <16 x float> %202, <16 x i32> , !dbg !51\n %209 = shufflevector <16 x float> %208, <16 x float> %204, <16 x i32> , !dbg !51\n %210 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %209, <16 x float> %172, <16 x float> %206), !dbg !51\n %211 = shufflevector <4 x float> %161, <4 x float> %163, <16 x i32> , !dbg !51\n %212 = shufflevector <16 x float> %211, <16 x float> %202, <16 x i32> , !dbg !51\n %213 = shufflevector <16 x float> %212, <16 x float> %204, <16 x i32> , !dbg !51\n %214 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %213, <16 x float> %174, <16 x float> %210), !dbg !51\n %215 = shufflevector <4 x float> %161, <4 x float> %163, <16 x i32> , !dbg !51\n %216 = shufflevector <16 x float> %215, <16 x float> %202, <16 x i32> , !dbg !51\n %217 = shufflevector <16 x float> %216, <16 x float> %204, <16 x i32> , !dbg !51\n %218 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %217, <16 x float> %176, <16 x float> %214), !dbg !51\n %219 = shufflevector <2 x float> %162, <2 x float> %164, <16 x i32> , !dbg !51\n %220 = shufflevector <2 x float> %166, <2 x float> poison, <16 x i32> , !dbg !51\n %221 = shufflevector <16 x float> %219, <16 x float> %220, <16 x i32> , !dbg !51\n %222 = shufflevector <2 x float> %168, <2 x float> poison, <16 x i32> , !dbg !51\n %223 = shufflevector <16 x float> %221, <16 x float> %222, <16 x i32> , !dbg !51\n %224 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %223, <16 x float> %178, <16 x float> %218), !dbg !51\n %225 = shufflevector <2 x float> %162, <2 x float> %164, <16 x i32> , !dbg !51\n %226 = shufflevector <16 x float> %225, <16 x float> %220, <16 x i32> , !dbg !51\n %227 = shufflevector <16 x float> %226, <16 x float> %222, <16 x i32> , !dbg !51\n %228 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %227, <16 x float> %180, <16 x float> %224), !dbg !51\n %229 = shufflevector <4 x float> %149, <4 x float> %152, <16 x i32> , !dbg !51\n %230 = shufflevector <4 x float> %155, <4 x float> poison, <16 x i32> , !dbg !51\n %231 = shufflevector <16 x float> %229, <16 x float> %230, <16 x i32> , !dbg !51\n %232 = shufflevector <4 x float> %158, <4 x float> poison, <16 x i32> , !dbg !51\n %233 = shufflevector <16 x float> %231, <16 x float> %232, <16 x i32> , !dbg !51\n %234 = shufflevector <16 x float> %228, <16 x float> poison, <16 x i32> , !dbg !51\n %235 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %233, <16 x float> %182, <16 x float> %234), !dbg !51\n %236 = shufflevector <4 x float> %149, <4 x float> %152, <16 x i32> , !dbg !51\n %237 = shufflevector <16 x float> %236, <16 x float> %230, <16 x i32> , !dbg !51\n %238 = shufflevector <16 x float> %237, <16 x float> %232, <16 x i32> , !dbg !51\n %239 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %238, <16 x float> %184, <16 x float> %235), !dbg !51\n %240 = shufflevector <4 x float> %149, <4 x float> %152, <16 x i32> , !dbg !51\n %241 = shufflevector <16 x float> %240, <16 x float> %230, <16 x i32> , !dbg !51\n %242 = shufflevector <16 x float> %241, <16 x float> %232, <16 x i32> , !dbg !51\n %243 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %242, <16 x float> %186, <16 x float> %239), !dbg !51\n %244 = shufflevector <4 x float> %149, <4 x float> %152, <16 x i32> , !dbg !51\n %245 = shufflevector <16 x float> %244, <16 x float> %230, <16 x i32> , !dbg !51\n %246 = shufflevector <16 x float> %245, <16 x float> %232, <16 x i32> , !dbg !51\n %247 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %246, <16 x float> %188, <16 x float> %243), !dbg !51\n %248 = shufflevector <4 x float> %150, <4 x float> %153, <16 x i32> , !dbg !51\n %249 = shufflevector <4 x float> %156, <4 x float> poison, <16 x i32> , !dbg !51\n %250 = shufflevector <16 x float> %248, <16 x float> %249, <16 x i32> , !dbg !51\n %251 = shufflevector <4 x float> %159, <4 x float> poison, <16 x i32> , !dbg !51\n %252 = shufflevector <16 x float> %250, <16 x float> %251, <16 x i32> , !dbg !51\n %253 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %252, <16 x float> %190, <16 x float> %247), !dbg !51\n %254 = shufflevector <4 x float> %150, <4 x float> %153, <16 x i32> , !dbg !51\n %255 = shufflevector <16 x float> %254, <16 x float> %249, <16 x i32> , !dbg !51\n %256 = shufflevector <16 x float> %255, <16 x float> %251, <16 x i32> , !dbg !51\n %257 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %256, <16 x float> %192, <16 x float> %253), !dbg !51\n %258 = shufflevector <4 x float> %150, <4 x float> %153, <16 x i32> , !dbg !51\n %259 = shufflevector <16 x float> %258, <16 x float> %249, <16 x i32> , !dbg !51\n %260 = shufflevector <16 x float> %259, <16 x float> %251, <16 x i32> , !dbg !51\n %261 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %260, <16 x float> %194, <16 x float> %257), !dbg !51\n %262 = shufflevector <4 x float> %150, <4 x float> %153, <16 x i32> , !dbg !51\n %263 = shufflevector <16 x float> %262, <16 x float> %249, <16 x i32> , !dbg !51\n %264 = shufflevector <16 x float> %263, <16 x float> %251, <16 x i32> , !dbg !51\n %265 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %264, <16 x float> %196, <16 x float> %261), !dbg !51\n %266 = shufflevector <2 x float> %151, <2 x float> %154, <16 x i32> , !dbg !51\n %267 = shufflevector <2 x float> %157, <2 x float> poison, <16 x i32> , !dbg !51\n %268 = shufflevector <16 x float> %266, <16 x float> %267, <16 x i32> , !dbg !51\n %269 = shufflevector <2 x float> %160, <2 x float> poison, <16 x i32> , !dbg !51\n %270 = shufflevector <16 x float> %268, <16 x float> %269, <16 x i32> , !dbg !51\n %271 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %270, <16 x float> %198, <16 x float> %265), !dbg !51\n %272 = shufflevector <2 x float> %151, <2 x float> %154, <16 x i32> , !dbg !51\n %273 = shufflevector <16 x float> %272, <16 x float> %267, <16 x i32> , !dbg !51\n %274 = shufflevector <16 x float> %273, <16 x float> %269, <16 x i32> , !dbg !51\n %275 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %274, <16 x float> %200, <16 x float> %271), !dbg !51\n %276 = getelementptr i8, ptr addrspace(1) %.pn15204, i64 32, !dbg !52\n %277 = getelementptr half, ptr addrspace(1) %.pn47205, i64 %88, !dbg !53\n %278 = getelementptr half, ptr addrspace(1) %.pn31206, i64 %88, !dbg !53\n %279 = add nuw nsw i32 %102, 1, !dbg !33\n %exitcond.not = icmp eq i32 %279, %35, !dbg !33\n br i1 %exitcond.not, label %._crit_edge.loopexit, label %101, !dbg !33\n\n._crit_edge.loopexit: ; preds = %101\n %280 = fptrunc <16 x float> %275 to <16 x half>, !dbg !54\n br label %._crit_edge, !dbg !40\n\n._crit_edge: ; preds = %._crit_edge.loopexit, %.._crit_edge_crit_edge\n %.pre-phi = phi i32 [ %.pre, %.._crit_edge_crit_edge ], [ %70, %._crit_edge.loopexit ], !dbg !34\n %281 = phi <16 x half> [ zeroinitializer, %.._crit_edge_crit_edge ], [ %280, %._crit_edge.loopexit ]\n %282 = or disjoint i32 %23, %27, !dbg !40\n %283 = or disjoint i32 %23, %26, !dbg !40\n %284 = mul i32 %283, %8, !dbg !55\n %285 = mul i32 %282, %8, !dbg !55\n %286 = sext i32 %284 to i64, !dbg !56\n %287 = getelementptr half, ptr addrspace(1) %2, i64 %286, !dbg !56\n %288 = sext i32 %285 to i64, !dbg !56\n %289 = getelementptr half, ptr addrspace(1) %2, i64 %288, !dbg !56\n %290 = sext i32 %31 to i64, !dbg !57\n %291 = getelementptr half, ptr addrspace(1) %287, i64 %290, !dbg !57\n %292 = getelementptr half, ptr addrspace(1) %289, i64 %290, !dbg !57\n %293 = icmp slt i32 %283, %3, !dbg !58\n %294 = icmp slt i32 %282, %3, !dbg !58\n %295 = icmp slt i32 %31, %4, !dbg !59\n %296 = and i1 %293, %295, !dbg !60\n %297 = and i1 %294, %295, !dbg !60\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !34\n %298 = and i32 %.pre-phi, 96, !dbg !34\n %299 = and i32 %32, 136, !dbg !34\n %300 = or disjoint i32 %298, %299, !dbg !34\n %301 = shl nuw nsw i32 %24, 1, !dbg !34\n %302 = and i32 %301, 16, !dbg !34\n %303 = or disjoint i32 %300, %302, !dbg !34\n %304 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %303, !dbg !34\n %305 = shufflevector <16 x half> %281, <16 x half> poison, <4 x i32> , !dbg !34\n store <4 x half> %305, ptr addrspace(3) %304, align 8, !dbg !34\n %306 = xor i32 %303, 272, !dbg !34\n %307 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %306, !dbg !34\n %308 = shufflevector <16 x half> %281, <16 x half> poison, <4 x i32> , !dbg !34\n store <4 x half> %308, ptr addrspace(3) %307, align 8, !dbg !34\n %309 = xor i32 %303, 520, !dbg !34\n %310 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %309, !dbg !34\n %311 = shufflevector <16 x half> %281, <16 x half> poison, <4 x i32> , !dbg !34\n store <4 x half> %311, ptr addrspace(3) %310, align 8, !dbg !34\n %312 = xor i32 %303, 792, !dbg !34\n %313 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %312, !dbg !34\n %314 = shufflevector <16 x half> %281, <16 x half> poison, <4 x i32> , !dbg !34\n store <4 x half> %314, ptr addrspace(3) %313, align 8, !dbg !34\n tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !34\n %315 = shl nuw nsw i32 %24, 6, !dbg !34\n %316 = and i32 %315, 768, !dbg !34\n %317 = shl nuw nsw i32 %29, 5, !dbg !34\n %318 = or disjoint i32 %316, %317, !dbg !34\n %319 = shl nuw nsw i32 %24, 2, !dbg !34\n %320 = and i32 %319, 16, !dbg !34\n %321 = or disjoint i32 %318, %320, !dbg !34\n %322 = and i32 %24, 24, !dbg !34\n %323 = xor i32 %321, %322, !dbg !34\n %324 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %323, !dbg !34\n %325 = getelementptr inbounds nuw i8, ptr addrspace(3) %324, i32 128, !dbg !34\n %326 = load <2 x i32>, ptr addrspace(3) %325, align 8, !dbg !34\n %327 = xor i32 %323, 8, !dbg !34\n %328 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %327, !dbg !34\n %329 = getelementptr inbounds nuw i8, ptr addrspace(3) %328, i32 128, !dbg !34\n %330 = load <2 x i32>, ptr addrspace(3) %329, align 8, !dbg !34\n %.uncasted.extract = load i32, ptr addrspace(3) %324, align 8, !dbg !34\n %331 = getelementptr inbounds nuw i8, ptr addrspace(3) %324, i32 4, !dbg !34\n %.uncasted.extract64 = load i32, ptr addrspace(3) %331, align 4, !dbg !34\n %.uncasted.extract66 = load i32, ptr addrspace(3) %328, align 8, !dbg !34\n %332 = getelementptr inbounds nuw i8, ptr addrspace(3) %328, i32 4, !dbg !34\n %.uncasted.extract68 = load i32, ptr addrspace(3) %332, align 4, !dbg !34\n tail call void asm sideeffect \"@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };\", \"r,r,r,r,l,b\"(i32 %.uncasted.extract, i32 %.uncasted.extract64, i32 %.uncasted.extract66, i32 %.uncasted.extract68, ptr addrspace(1) %291, i1 %296) #4, !dbg !34\n %.uncasted.extract70 = extractelement <2 x i32> %326, i64 0, !dbg !34\n %.uncasted.extract72 = extractelement <2 x i32> %326, i64 1, !dbg !34\n %.uncasted.extract74 = extractelement <2 x i32> %330, i64 0, !dbg !34\n %.uncasted.extract76 = extractelement <2 x i32> %330, i64 1, !dbg !34\n tail call void asm sideeffect \"@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };\", \"r,r,r,r,l,b\"(i32 %.uncasted.extract70, i32 %.uncasted.extract72, i32 %.uncasted.extract74, i32 %.uncasted.extract76, ptr addrspace(1) %292, i1 %297) #4, !dbg !34\n ret void, !dbg !61\n}\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare i32 @llvm.smin.i32(i32, i32) #1\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1\n\n; Function Attrs: convergent nocallback nounwind\ndeclare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2\n\n; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>) #3\n\nattributes #0 = { \"nvvm.reqntid\"=\"32\" }\nattributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }\nattributes #2 = { convergent nocallback nounwind }\nattributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }\nattributes #4 = { nounwind }\n\n!llvm.dbg.cu = !{!0}\n!llvm.module.flags = !{!2, !3}\n!llvm.ident = !{!4}\n\n!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: \"triton\", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)\n!1 = !DIFile(filename: \"test_complex_kernels.py\", directory: \"/scratch/findhao/tritonparse/tests\")\n!2 = !{i32 2, !\"Debug Info Version\", i32 3}\n!3 = !{i32 4, !\"nvvm-reflect-ftz\", i32 1}\n!4 = !{!\"clang version 3.8.0 (tags/RELEASE_380/final)\"}\n!5 = distinct !DISubprogram(name: \"matmul_kernel\", linkageName: \"matmul_kernel\", scope: !1, file: !1, line: 38, type: !6, scopeLine: 38, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)\n!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)\n!7 = !{}\n!8 = !DILocation(line: 47, column: 24, scope: !5)\n!9 = !DILocation(line: 40, column: 22, scope: !10, inlinedAt: !12)\n!10 = distinct !DILexicalBlockFile(scope: !5, file: !11, discriminator: 0)\n!11 = !DIFile(filename: \"standard.py\", directory: \"/scratch/findhao/pta/triton/python/triton/language\")\n!12 = !DILocation(line: 48, column: 27, scope: !5)\n!13 = !DILocation(line: 40, column: 28, scope: !10, inlinedAt: !12)\n!14 = !DILocation(line: 40, column: 22, scope: !10, inlinedAt: !15)\n!15 = !DILocation(line: 49, column: 27, scope: !5)\n!16 = !DILocation(line: 40, column: 28, scope: !10, inlinedAt: !15)\n!17 = !DILocation(line: 51, column: 22, scope: !5)\n!18 = !DILocation(line: 53, column: 33, scope: !5)\n!19 = !DILocation(line: 53, column: 46, scope: !5)\n!20 = !DILocation(line: 54, column: 33, scope: !5)\n!21 = !DILocation(line: 54, column: 27, scope: !5)\n!22 = !DILocation(line: 55, column: 19, scope: !5)\n!23 = !DILocation(line: 55, column: 40, scope: !5)\n!24 = !DILocation(line: 57, column: 23, scope: !5)\n!25 = !DILocation(line: 57, column: 51, scope: !5)\n!26 = !DILocation(line: 58, column: 23, scope: !5)\n!27 = !DILocation(line: 58, column: 51, scope: !5)\n!28 = !DILocation(line: 58, column: 38, scope: !5)\n!29 = !DILocation(line: 60, column: 60, scope: !5)\n!30 = !DILocation(line: 40, column: 22, scope: !10, inlinedAt: !31)\n!31 = !DILocation(line: 64, column: 33, scope: !5)\n!32 = !DILocation(line: 40, column: 28, scope: !10, inlinedAt: !31)\n!33 = !DILocation(line: 64, column: 22, scope: !5)\n!34 = !DILocation(line: 76, column: 21, scope: !5)\n!35 = !DILocation(line: 69, column: 33, scope: !5)\n!36 = !DILocation(line: 58, column: 68, scope: !5)\n!37 = !DILocation(line: 61, column: 40, scope: !5)\n!38 = !DILocation(line: 61, column: 52, scope: !5)\n!39 = !DILocation(line: 61, column: 22, scope: !5)\n!40 = !DILocation(line: 57, column: 38, scope: !5)\n!41 = !DILocation(line: 57, column: 68, scope: !5)\n!42 = !DILocation(line: 60, column: 41, scope: !5)\n!43 = !DILocation(line: 60, column: 53, scope: !5)\n!44 = !DILocation(line: 60, column: 22, scope: !5)\n!45 = !DILocation(line: 65, column: 59, scope: !5)\n!46 = !DILocation(line: 65, column: 55, scope: !5)\n!47 = !DILocation(line: 65, column: 51, scope: !5)\n!48 = !DILocation(line: 65, column: 20, scope: !5)\n!49 = !DILocation(line: 66, column: 51, scope: !5)\n!50 = !DILocation(line: 66, column: 20, scope: !5)\n!51 = !DILocation(line: 67, column: 33, scope: !5)\n!52 = !DILocation(line: 68, column: 18, scope: !5)\n!53 = !DILocation(line: 69, column: 18, scope: !5)\n!54 = !DILocation(line: 70, column: 23, scope: !5)\n!55 = !DILocation(line: 74, column: 33, scope: !5)\n!56 = !DILocation(line: 74, column: 21, scope: !5)\n!57 = !DILocation(line: 74, column: 52, scope: !5)\n!58 = !DILocation(line: 75, column: 33, scope: !5)\n!59 = !DILocation(line: 75, column: 58, scope: !5)\n!60 = !DILocation(line: 75, column: 39, scope: !5)\n!61 = !DILocation(line: 76, column: 4, scope: !5)\n","matmul_kernel.ptx":"//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 8.7\n.target sm_75\n.address_size 64\n\n\t// .globl\tmatmul_kernel // -- Begin function matmul_kernel\n.extern .shared .align 16 .b8 global_smem[];\n // @matmul_kernel\n.visible .entry matmul_kernel(\n\t.param .u64 .ptr .global .align 1 matmul_kernel_param_0,\n\t.param .u64 .ptr .global .align 1 matmul_kernel_param_1,\n\t.param .u64 .ptr .global .align 1 matmul_kernel_param_2,\n\t.param .u32 matmul_kernel_param_3,\n\t.param .u32 matmul_kernel_param_4,\n\t.param .u32 matmul_kernel_param_5,\n\t.param .u32 matmul_kernel_param_6,\n\t.param .u32 matmul_kernel_param_7,\n\t.param .u32 matmul_kernel_param_8,\n\t.param .u64 .ptr .global .align 1 matmul_kernel_param_9\n)\n.reqntid 32\n{\n\t.reg .pred \t%p<11>;\n\t.reg .b16 \t%rs<57>;\n\t.reg .b32 \t%r<620>;\n\t.reg .b64 \t%rd<25>;\n\t.loc\t1 38 0 // test_complex_kernels.py:38:0\n$L__func_begin0:\n\t.loc\t1 38 0 // test_complex_kernels.py:38:0\n\n// %bb.0:\n\tld.param.b32 \t%r73, [matmul_kernel_param_8];\n\tld.param.b32 \t%r593, [matmul_kernel_param_5];\n\tld.param.b32 \t%r69, [matmul_kernel_param_4];\n\tld.param.b32 \t%r68, [matmul_kernel_param_3];\n\tld.param.b64 \t%rd11, [matmul_kernel_param_2];\n$L__tmp0:\n\t.loc\t1 47 24 // test_complex_kernels.py:47:24\n\tmov.u32 \t%r74, %ctaid.x;\n$L__tmp1:\n\t.loc\t2 40 22 // standard.py:40:22 @[ test_complex_kernels.py:48:27 ]\n\tadd.s32 \t%r75, %r68, 15;\n\t.loc\t2 40 28 // standard.py:40:28 @[ test_complex_kernels.py:48:27 ]\n\tshr.s32 \t%r76, %r75, 31;\n\tshr.u32 \t%r77, %r76, 28;\n\tadd.s32 \t%r78, %r75, %r77;\n\tshr.s32 \t%r79, %r78, 4;\n$L__tmp2:\n\t.loc\t2 40 22 // standard.py:40:22 @[ test_complex_kernels.py:49:27 ]\n\tadd.s32 \t%r80, %r69, 31;\n\t.loc\t2 40 28 // standard.py:40:28 @[ test_complex_kernels.py:49:27 ]\n\tshr.s32 \t%r81, %r80, 31;\n\tshr.u32 \t%r82, %r81, 27;\n\tadd.s32 \t%r83, %r80, %r82;\n\tshr.s32 \t%r84, %r83, 5;\n$L__tmp3:\n\t.loc\t1 51 22 // test_complex_kernels.py:51:22\n\tdiv.s32 \t%r86, %r74, %r84;\n\t.loc\t1 53 33 // test_complex_kernels.py:53:33\n\tsub.s32 \t%r87, %r79, %r86;\n\t.loc\t1 53 46 // test_complex_kernels.py:53:46\n\tmin.s32 \t%r88, %r87, 1;\n\t.loc\t1 54 33 // test_complex_kernels.py:54:33\n\trem.s32 \t%r89, %r74, %r88;\n\t.loc\t1 54 27 // test_complex_kernels.py:54:27\n\tadd.s32 \t%r90, %r89, %r86;\n\t.loc\t1 55 19 // test_complex_kernels.py:55:19\n\tmul.lo.s32 \t%r91, %r86, %r84;\n\tsub.s32 \t%r92, %r74, %r91;\n\t.loc\t1 55 40 // test_complex_kernels.py:55:40\n\tdiv.s32 \t%r93, %r92, %r88;\n\t.loc\t1 57 23 // test_complex_kernels.py:57:23\n\tshl.b32 \t%r1, %r90, 4;\n\t.loc\t1 57 51 // test_complex_kernels.py:57:51\n\tmov.u32 \t%r2, %tid.x;\n\tbfe.u32 \t%r3, %r2, 2, 3;\n\tor.b32 \t%r4, %r3, 8;\n\t.loc\t1 58 23 // test_complex_kernels.py:58:23\n\tshl.b32 \t%r94, %r93, 5;\n\t.loc\t1 58 51 // test_complex_kernels.py:58:51\n\tand.b32 \t%r5, %r2, 3;\n\tshl.b32 \t%r95, %r5, 3;\n\t.loc\t1 58 38 // test_complex_kernels.py:58:38\n\tor.b32 \t%r6, %r94, %r95;\n\t.loc\t1 60 60 // test_complex_kernels.py:60:60\n\tshl.b32 \t%r7, %r2, 3;\n$L__tmp4:\n\t.loc\t2 40 22 // standard.py:40:22 @[ test_complex_kernels.py:64:33 ]\n\tadd.s32 \t%r96, %r593, 15;\n$L__tmp5:\n\t.loc\t1 64 22 // test_complex_kernels.py:64:22\n\tsetp.gt.s32 \t%p1, %r96, 15;\n\tmov.b32 \t%r592, global_smem;\n\t@%p1 bra \t$L__BB0_2;\n\tbra.uni \t$L__BB0_1;\n$L__BB0_2: // %.lr.ph\n\t.loc\t1 0 22 // test_complex_kernels.py:0:22\n\tld.param.b32 \t%r72, [matmul_kernel_param_7];\n\tld.param.b32 \t%r71, [matmul_kernel_param_6];\n\tld.param.b64 \t%rd23, [matmul_kernel_param_1];\n\tld.param.b64 \t%rd9, [matmul_kernel_param_0];\n\tand.b32 \t%r8, %r7, 8;\n\tshr.s32 \t%r97, %r96, 31;\n\tshr.u32 \t%r98, %r97, 28;\n\tadd.s32 \t%r99, %r96, %r98;\n\tshr.s32 \t%r594, %r99, 4;\n\t.loc\t1 69 33 // test_complex_kernels.py:69:33\n\tshl.b32 \t%r124, %r72, 4;\n\t.loc\t1 58 68 // test_complex_kernels.py:58:68\n\trem.s32 \t%r125, %r6, %r69;\n\t.loc\t1 61 52 // test_complex_kernels.py:61:52\n\tmad.lo.s32 \t%r126, %r72, %r3, %r125;\n\t.loc\t1 57 51 // test_complex_kernels.py:57:51\n\tbfe.u32 \t%r127, %r2, 1, 4;\n\t.loc\t1 57 38 // test_complex_kernels.py:57:38\n\tor.b32 \t%r128, %r1, %r127;\n\t.loc\t1 57 68 // test_complex_kernels.py:57:68\n\trem.s32 \t%r129, %r128, %r68;\n\t.loc\t1 60 53 // test_complex_kernels.py:60:53\n\tmad.lo.s32 \t%r130, %r129, %r71, %r8;\n\t.loc\t1 60 22 // test_complex_kernels.py:60:22\n\tmul.wide.s32 \t%rd12, %r130, 2;\n\tadd.s64 \t%rd24, %rd9, %rd12;\n\tshl.b32 \t%r131, %r2, 5;\n\tand.b32 \t%r132, %r131, 992;\n\tadd.s32 \t%r11, %r592, %r132;\n\tand.b32 \t%r134, %r131, 768;\n\tadd.s32 \t%r12, %r592, %r134;\n\tshl.b32 \t%r611, %r2, 4;\n\tand.b32 \t%r135, %r611, 112;\n\tadd.s32 \t%r14, %r592, %r135;\n\t.loc\t1 64 22 // test_complex_kernels.py:64:22\n\tmad.lo.s32 \t%r137, %r72, %r4, %r125;\n\tmul.wide.s32 \t%rd2, %r137, 2;\n\tmul.wide.s32 \t%rd3, %r124, 2;\n\tmul.wide.s32 \t%rd4, %r126, 2;\n\tmov.b32 \t%r595, 0f00000000;\n\tmov.b32 \t%r596, %r595;\n\tmov.b32 \t%r597, %r595;\n\tmov.b32 \t%r598, %r595;\n\tmov.b32 \t%r599, %r595;\n\tmov.b32 \t%r600, %r595;\n\tmov.b32 \t%r601, %r595;\n\tmov.b32 \t%r602, %r595;\n\tmov.b32 \t%r603, %r595;\n\tmov.b32 \t%r604, %r595;\n\tmov.b32 \t%r605, %r595;\n\tmov.b32 \t%r606, %r595;\n\tmov.b32 \t%r607, %r595;\n\tmov.b32 \t%r608, %r595;\n\tmov.b32 \t%r609, %r595;\n\tmov.b32 \t%r610, %r595;\n$L__BB0_3: // =>This Inner Loop Header: Depth=1\n\t.loc\t1 0 22 // test_complex_kernels.py:0:22\n\tadd.s64 \t%rd15, %rd23, %rd2;\n\tadd.s64 \t%rd14, %rd23, %rd4;\n\t.loc\t1 65 51 // test_complex_kernels.py:65:51\n\tsetp.lt.s32 \t%p2, %r8, %r593;\n\tmov.b32 \t%r142, 0;\n\t.loc\t1 65 20 // test_complex_kernels.py:65:20\n\t// begin inline asm\n\tmov.u32 %r138, %r142;\n\tmov.u32 %r139, %r142;\n\tmov.u32 %r140, %r142;\n\tmov.u32 %r141, %r142;\n\t@%p2 ld.global.v4.b32 { %r138, %r139, %r140, %r141 }, [ %rd24 + 0 ];\n\t// end inline asm\n\t.loc\t1 66 51 // test_complex_kernels.py:66:51\n\tsetp.lt.s32 \t%p3, %r3, %r593;\n\tsetp.lt.s32 \t%p4, %r4, %r593;\n\t.loc\t1 66 20 // test_complex_kernels.py:66:20\n\t// begin inline asm\n\tmov.u32 %r146, %r142;\n\tmov.u32 %r147, %r142;\n\tmov.u32 %r148, %r142;\n\tmov.u32 %r149, %r142;\n\t@%p3 ld.global.v4.b32 { %r146, %r147, %r148, %r149 }, [ %rd14 + 0 ];\n\t// end inline asm\n\t// begin inline asm\n\tmov.u32 %r154, %r142;\n\tmov.u32 %r155, %r142;\n\tmov.u32 %r156, %r142;\n\tmov.u32 %r157, %r142;\n\t@%p4 ld.global.v4.b32 { %r154, %r155, %r156, %r157 }, [ %rd15 + 0 ];\n\t// end inline asm\n\t.loc\t1 67 33 // test_complex_kernels.py:67:33\n\tmov.b32 \t{%rs1, %rs2}, %r139;\n\tcvt.f32.f16 \t%r162, %rs2;\n\tcvt.f32.f16 \t%r163, %rs1;\n\tmov.b32 \t{%rs3, %rs4}, %r138;\n\tcvt.f32.f16 \t%r164, %rs4;\n\tcvt.f32.f16 \t%r165, %rs3;\n\tmov.b32 \t{%rs5, %rs6}, %r141;\n\tcvt.f32.f16 \t%r166, %rs6;\n\tcvt.f32.f16 \t%r167, %rs5;\n\tmov.b32 \t{%rs7, %rs8}, %r140;\n\tcvt.f32.f16 \t%r168, %rs8;\n\tcvt.f32.f16 \t%r169, %rs7;\n\tbar.sync \t0;\n\tst.shared.v4.b32 \t[%r11], {%r165, %r164, %r163, %r162};\n\tst.shared.v4.b32 \t[%r11+16], {%r169, %r168, %r167, %r166};\n\tbar.sync \t0;\n\tmov.b32 \t{%rs9, %rs10}, %r147;\n\tcvt.f32.f16 \t%r170, %rs10;\n\tcvt.f32.f16 \t%r171, %rs9;\n\tmov.b32 \t{%rs11, %rs12}, %r146;\n\tcvt.f32.f16 \t%r172, %rs12;\n\tcvt.f32.f16 \t%r173, %rs11;\n\tmov.b32 \t{%rs13, %rs14}, %r149;\n\tcvt.f32.f16 \t%r174, %rs14;\n\tcvt.f32.f16 \t%r175, %rs13;\n\tmov.b32 \t{%rs15, %rs16}, %r148;\n\tcvt.f32.f16 \t%r176, %rs16;\n\tcvt.f32.f16 \t%r177, %rs15;\n\tmov.b32 \t{%rs17, %rs18}, %r155;\n\tcvt.f32.f16 \t%r178, %rs18;\n\tcvt.f32.f16 \t%r179, %rs17;\n\tmov.b32 \t{%rs19, %rs20}, %r154;\n\tcvt.f32.f16 \t%r180, %rs20;\n\tcvt.f32.f16 \t%r181, %rs19;\n\tmov.b32 \t{%rs21, %rs22}, %r157;\n\tcvt.f32.f16 \t%r182, %rs22;\n\tcvt.f32.f16 \t%r183, %rs21;\n\tmov.b32 \t{%rs23, %rs24}, %r156;\n\tcvt.f32.f16 \t%r184, %rs24;\n\tcvt.f32.f16 \t%r185, %rs23;\n\tld.shared.v2.b32 \t{%r186, %r187}, [%r12+32];\n\tld.shared.v2.b32 \t{%r188, %r189}, [%r12+24];\n\tld.shared.v2.b32 \t{%r190, %r191}, [%r12+48];\n\tld.shared.v2.b32 \t{%r192, %r193}, [%r12+40];\n\tld.shared.v2.b32 \t{%r194, %r195}, [%r12+56];\n\tld.shared.v2.b32 \t{%r196, %r197}, [%r12+96];\n\tld.shared.v2.b32 \t{%r198, %r199}, [%r12+88];\n\tld.shared.v2.b32 \t{%r200, %r201}, [%r12+112];\n\tld.shared.v2.b32 \t{%r202, %r203}, [%r12+104];\n\tld.shared.v2.b32 \t{%r204, %r205}, [%r12+120];\n\tld.shared.v2.b32 \t{%r206, %r207}, [%r12+160];\n\tld.shared.v2.b32 \t{%r208, %r209}, [%r12+152];\n\tld.shared.v2.b32 \t{%r210, %r211}, [%r12+176];\n\tld.shared.v2.b32 \t{%r212, %r213}, [%r12+168];\n\tld.shared.v2.b32 \t{%r214, %r215}, [%r12+184];\n\tld.shared.v2.b32 \t{%r216, %r217}, [%r12+224];\n\tld.shared.v2.b32 \t{%r218, %r219}, [%r12+216];\n\tld.shared.v2.b32 \t{%r220, %r221}, [%r12+240];\n\tld.shared.v2.b32 \t{%r222, %r223}, [%r12+232];\n\tld.shared.v2.b32 \t{%r224, %r225}, [%r12+248];\n\tld.shared.v4.b32 \t{%r226, %r227, %r228, %r229}, [%r12];\n\tld.shared.v2.b32 \t{%r230, %r231}, [%r12+16];\n\tld.shared.v4.b32 \t{%r232, %r233, %r234, %r235}, [%r12+64];\n\tld.shared.v2.b32 \t{%r236, %r237}, [%r12+80];\n\tld.shared.v4.b32 \t{%r238, %r239, %r240, %r241}, [%r12+128];\n\tld.shared.v2.b32 \t{%r242, %r243}, [%r12+144];\n\tld.shared.v4.b32 \t{%r244, %r245, %r246, %r247}, [%r12+192];\n\tld.shared.v2.b32 \t{%r248, %r249}, [%r12+208];\n\tbar.sync \t0;\n\tst.shared.v4.b32 \t[%r11], {%r173, %r172, %r171, %r170};\n\tst.shared.v4.b32 \t[%r11+16], {%r177, %r176, %r175, %r174};\n\tst.shared.v4.b32 \t[%r11+1024], {%r181, %r180, %r179, %r178};\n\tst.shared.v4.b32 \t[%r11+1040], {%r185, %r184, %r183, %r182};\n\tbar.sync \t0;\n\tld.shared.v4.b32 \t{%r250, %r251, %r252, %r253}, [%r14];\n\tld.shared.v4.b32 \t{%r254, %r255, %r256, %r257}, [%r14+128];\n\tld.shared.v4.b32 \t{%r258, %r259, %r260, %r261}, [%r14+256];\n\tld.shared.v4.b32 \t{%r262, %r263, %r264, %r265}, [%r14+384];\n\tld.shared.v4.b32 \t{%r266, %r267, %r268, %r269}, [%r14+512];\n\tld.shared.v4.b32 \t{%r270, %r271, %r272, %r273}, [%r14+640];\n\tld.shared.v4.b32 \t{%r274, %r275, %r276, %r277}, [%r14+768];\n\tld.shared.v4.b32 \t{%r278, %r279, %r280, %r281}, [%r14+896];\n\tld.shared.v4.b32 \t{%r282, %r283, %r284, %r285}, [%r14+1024];\n\tld.shared.v4.b32 \t{%r286, %r287, %r288, %r289}, [%r14+1152];\n\tld.shared.v4.b32 \t{%r290, %r291, %r292, %r293}, [%r14+1280];\n\tld.shared.v4.b32 \t{%r294, %r295, %r296, %r297}, [%r14+1408];\n\tld.shared.v4.b32 \t{%r298, %r299, %r300, %r301}, [%r14+1536];\n\tld.shared.v4.b32 \t{%r302, %r303, %r304, %r305}, [%r14+1664];\n\tld.shared.v4.b32 \t{%r306, %r307, %r308, %r309}, [%r14+1792];\n\tld.shared.v4.b32 \t{%r310, %r311, %r312, %r313}, [%r14+1920];\n\tfma.rn.f32 \t%r314, %r244, %r253, %r610;\n\tfma.rn.f32 \t%r315, %r238, %r253, %r609;\n\tfma.rn.f32 \t%r316, %r232, %r253, %r608;\n\tfma.rn.f32 \t%r317, %r226, %r250, %r595;\n\tfma.rn.f32 \t%r318, %r232, %r250, %r596;\n\tfma.rn.f32 \t%r319, %r238, %r250, %r597;\n\tfma.rn.f32 \t%r320, %r244, %r250, %r598;\n\tfma.rn.f32 \t%r321, %r226, %r251, %r599;\n\tfma.rn.f32 \t%r322, %r232, %r251, %r600;\n\tfma.rn.f32 \t%r323, %r238, %r251, %r601;\n\tfma.rn.f32 \t%r324, %r244, %r251, %r602;\n\tfma.rn.f32 \t%r325, %r226, %r252, %r603;\n\tfma.rn.f32 \t%r326, %r232, %r252, %r604;\n\tfma.rn.f32 \t%r327, %r238, %r252, %r605;\n\tfma.rn.f32 \t%r328, %r244, %r252, %r606;\n\tfma.rn.f32 \t%r329, %r226, %r253, %r607;\n\tfma.rn.f32 \t%r330, %r227, %r257, %r329;\n\tfma.rn.f32 \t%r331, %r245, %r256, %r328;\n\tfma.rn.f32 \t%r332, %r239, %r256, %r327;\n\tfma.rn.f32 \t%r333, %r233, %r256, %r326;\n\tfma.rn.f32 \t%r334, %r227, %r256, %r325;\n\tfma.rn.f32 \t%r335, %r245, %r255, %r324;\n\tfma.rn.f32 \t%r336, %r239, %r255, %r323;\n\tfma.rn.f32 \t%r337, %r233, %r255, %r322;\n\tfma.rn.f32 \t%r338, %r227, %r255, %r321;\n\tfma.rn.f32 \t%r339, %r245, %r254, %r320;\n\tfma.rn.f32 \t%r340, %r239, %r254, %r319;\n\tfma.rn.f32 \t%r341, %r233, %r254, %r318;\n\tfma.rn.f32 \t%r342, %r227, %r254, %r317;\n\tfma.rn.f32 \t%r343, %r233, %r257, %r316;\n\tfma.rn.f32 \t%r344, %r239, %r257, %r315;\n\tfma.rn.f32 \t%r345, %r245, %r257, %r314;\n\tfma.rn.f32 \t%r346, %r246, %r261, %r345;\n\tfma.rn.f32 \t%r347, %r240, %r261, %r344;\n\tfma.rn.f32 \t%r348, %r234, %r261, %r343;\n\tfma.rn.f32 \t%r349, %r228, %r258, %r342;\n\tfma.rn.f32 \t%r350, %r234, %r258, %r341;\n\tfma.rn.f32 \t%r351, %r240, %r258, %r340;\n\tfma.rn.f32 \t%r352, %r246, %r258, %r339;\n\tfma.rn.f32 \t%r353, %r228, %r259, %r338;\n\tfma.rn.f32 \t%r354, %r234, %r259, %r337;\n\tfma.rn.f32 \t%r355, %r240, %r259, %r336;\n\tfma.rn.f32 \t%r356, %r246, %r259, %r335;\n\tfma.rn.f32 \t%r357, %r228, %r260, %r334;\n\tfma.rn.f32 \t%r358, %r234, %r260, %r333;\n\tfma.rn.f32 \t%r359, %r240, %r260, %r332;\n\tfma.rn.f32 \t%r360, %r246, %r260, %r331;\n\tfma.rn.f32 \t%r361, %r228, %r261, %r330;\n\tfma.rn.f32 \t%r362, %r229, %r265, %r361;\n\tfma.rn.f32 \t%r363, %r247, %r264, %r360;\n\tfma.rn.f32 \t%r364, %r241, %r264, %r359;\n\tfma.rn.f32 \t%r365, %r235, %r264, %r358;\n\tfma.rn.f32 \t%r366, %r229, %r264, %r357;\n\tfma.rn.f32 \t%r367, %r247, %r263, %r356;\n\tfma.rn.f32 \t%r368, %r241, %r263, %r355;\n\tfma.rn.f32 \t%r369, %r235, %r263, %r354;\n\tfma.rn.f32 \t%r370, %r229, %r263, %r353;\n\tfma.rn.f32 \t%r371, %r247, %r262, %r352;\n\tfma.rn.f32 \t%r372, %r241, %r262, %r351;\n\tfma.rn.f32 \t%r373, %r235, %r262, %r350;\n\tfma.rn.f32 \t%r374, %r229, %r262, %r349;\n\tfma.rn.f32 \t%r375, %r235, %r265, %r348;\n\tfma.rn.f32 \t%r376, %r241, %r265, %r347;\n\tfma.rn.f32 \t%r377, %r247, %r265, %r346;\n\tfma.rn.f32 \t%r378, %r248, %r269, %r377;\n\tfma.rn.f32 \t%r379, %r242, %r269, %r376;\n\tfma.rn.f32 \t%r380, %r236, %r269, %r375;\n\tfma.rn.f32 \t%r381, %r230, %r266, %r374;\n\tfma.rn.f32 \t%r382, %r236, %r266, %r373;\n\tfma.rn.f32 \t%r383, %r242, %r266, %r372;\n\tfma.rn.f32 \t%r384, %r248, %r266, %r371;\n\tfma.rn.f32 \t%r385, %r230, %r267, %r370;\n\tfma.rn.f32 \t%r386, %r236, %r267, %r369;\n\tfma.rn.f32 \t%r387, %r242, %r267, %r368;\n\tfma.rn.f32 \t%r388, %r248, %r267, %r367;\n\tfma.rn.f32 \t%r389, %r230, %r268, %r366;\n\tfma.rn.f32 \t%r390, %r236, %r268, %r365;\n\tfma.rn.f32 \t%r391, %r242, %r268, %r364;\n\tfma.rn.f32 \t%r392, %r248, %r268, %r363;\n\tfma.rn.f32 \t%r393, %r230, %r269, %r362;\n\tfma.rn.f32 \t%r394, %r231, %r273, %r393;\n\tfma.rn.f32 \t%r395, %r249, %r272, %r392;\n\tfma.rn.f32 \t%r396, %r243, %r272, %r391;\n\tfma.rn.f32 \t%r397, %r237, %r272, %r390;\n\tfma.rn.f32 \t%r398, %r231, %r272, %r389;\n\tfma.rn.f32 \t%r399, %r249, %r271, %r388;\n\tfma.rn.f32 \t%r400, %r243, %r271, %r387;\n\tfma.rn.f32 \t%r401, %r237, %r271, %r386;\n\tfma.rn.f32 \t%r402, %r231, %r271, %r385;\n\tfma.rn.f32 \t%r403, %r249, %r270, %r384;\n\tfma.rn.f32 \t%r404, %r243, %r270, %r383;\n\tfma.rn.f32 \t%r405, %r237, %r270, %r382;\n\tfma.rn.f32 \t%r406, %r231, %r270, %r381;\n\tfma.rn.f32 \t%r407, %r237, %r273, %r380;\n\tfma.rn.f32 \t%r408, %r243, %r273, %r379;\n\tfma.rn.f32 \t%r409, %r249, %r273, %r378;\n\tfma.rn.f32 \t%r410, %r218, %r277, %r409;\n\tfma.rn.f32 \t%r411, %r208, %r277, %r408;\n\tfma.rn.f32 \t%r412, %r198, %r277, %r407;\n\tfma.rn.f32 \t%r413, %r188, %r274, %r406;\n\tfma.rn.f32 \t%r414, %r198, %r274, %r405;\n\tfma.rn.f32 \t%r415, %r208, %r274, %r404;\n\tfma.rn.f32 \t%r416, %r218, %r274, %r403;\n\tfma.rn.f32 \t%r417, %r188, %r275, %r402;\n\tfma.rn.f32 \t%r418, %r198, %r275, %r401;\n\tfma.rn.f32 \t%r419, %r208, %r275, %r400;\n\tfma.rn.f32 \t%r420, %r218, %r275, %r399;\n\tfma.rn.f32 \t%r421, %r188, %r276, %r398;\n\tfma.rn.f32 \t%r422, %r198, %r276, %r397;\n\tfma.rn.f32 \t%r423, %r208, %r276, %r396;\n\tfma.rn.f32 \t%r424, %r218, %r276, %r395;\n\tfma.rn.f32 \t%r425, %r188, %r277, %r394;\n\tfma.rn.f32 \t%r426, %r189, %r281, %r425;\n\tfma.rn.f32 \t%r427, %r219, %r280, %r424;\n\tfma.rn.f32 \t%r428, %r209, %r280, %r423;\n\tfma.rn.f32 \t%r429, %r199, %r280, %r422;\n\tfma.rn.f32 \t%r430, %r189, %r280, %r421;\n\tfma.rn.f32 \t%r431, %r219, %r279, %r420;\n\tfma.rn.f32 \t%r432, %r209, %r279, %r419;\n\tfma.rn.f32 \t%r433, %r199, %r279, %r418;\n\tfma.rn.f32 \t%r434, %r189, %r279, %r417;\n\tfma.rn.f32 \t%r435, %r219, %r278, %r416;\n\tfma.rn.f32 \t%r436, %r209, %r278, %r415;\n\tfma.rn.f32 \t%r437, %r199, %r278, %r414;\n\tfma.rn.f32 \t%r438, %r189, %r278, %r413;\n\tfma.rn.f32 \t%r439, %r199, %r281, %r412;\n\tfma.rn.f32 \t%r440, %r209, %r281, %r411;\n\tfma.rn.f32 \t%r441, %r219, %r281, %r410;\n\tfma.rn.f32 \t%r442, %r216, %r285, %r441;\n\tfma.rn.f32 \t%r443, %r206, %r285, %r440;\n\tfma.rn.f32 \t%r444, %r196, %r285, %r439;\n\tfma.rn.f32 \t%r445, %r186, %r282, %r438;\n\tfma.rn.f32 \t%r446, %r196, %r282, %r437;\n\tfma.rn.f32 \t%r447, %r206, %r282, %r436;\n\tfma.rn.f32 \t%r448, %r216, %r282, %r435;\n\tfma.rn.f32 \t%r449, %r186, %r283, %r434;\n\tfma.rn.f32 \t%r450, %r196, %r283, %r433;\n\tfma.rn.f32 \t%r451, %r206, %r283, %r432;\n\tfma.rn.f32 \t%r452, %r216, %r283, %r431;\n\tfma.rn.f32 \t%r453, %r186, %r284, %r430;\n\tfma.rn.f32 \t%r454, %r196, %r284, %r429;\n\tfma.rn.f32 \t%r455, %r206, %r284, %r428;\n\tfma.rn.f32 \t%r456, %r216, %r284, %r427;\n\tfma.rn.f32 \t%r457, %r186, %r285, %r426;\n\tfma.rn.f32 \t%r458, %r187, %r289, %r457;\n\tfma.rn.f32 \t%r459, %r217, %r288, %r456;\n\tfma.rn.f32 \t%r460, %r207, %r288, %r455;\n\tfma.rn.f32 \t%r461, %r197, %r288, %r454;\n\tfma.rn.f32 \t%r462, %r187, %r288, %r453;\n\tfma.rn.f32 \t%r463, %r217, %r287, %r452;\n\tfma.rn.f32 \t%r464, %r207, %r287, %r451;\n\tfma.rn.f32 \t%r465, %r197, %r287, %r450;\n\tfma.rn.f32 \t%r466, %r187, %r287, %r449;\n\tfma.rn.f32 \t%r467, %r217, %r286, %r448;\n\tfma.rn.f32 \t%r468, %r207, %r286, %r447;\n\tfma.rn.f32 \t%r469, %r197, %r286, %r446;\n\tfma.rn.f32 \t%r470, %r187, %r286, %r445;\n\tfma.rn.f32 \t%r471, %r197, %r289, %r444;\n\tfma.rn.f32 \t%r472, %r207, %r289, %r443;\n\tfma.rn.f32 \t%r473, %r217, %r289, %r442;\n\tfma.rn.f32 \t%r474, %r222, %r293, %r473;\n\tfma.rn.f32 \t%r475, %r212, %r293, %r472;\n\tfma.rn.f32 \t%r476, %r202, %r293, %r471;\n\tfma.rn.f32 \t%r477, %r192, %r290, %r470;\n\tfma.rn.f32 \t%r478, %r202, %r290, %r469;\n\tfma.rn.f32 \t%r479, %r212, %r290, %r468;\n\tfma.rn.f32 \t%r480, %r222, %r290, %r467;\n\tfma.rn.f32 \t%r481, %r192, %r291, %r466;\n\tfma.rn.f32 \t%r482, %r202, %r291, %r465;\n\tfma.rn.f32 \t%r483, %r212, %r291, %r464;\n\tfma.rn.f32 \t%r484, %r222, %r291, %r463;\n\tfma.rn.f32 \t%r485, %r192, %r292, %r462;\n\tfma.rn.f32 \t%r486, %r202, %r292, %r461;\n\tfma.rn.f32 \t%r487, %r212, %r292, %r460;\n\tfma.rn.f32 \t%r488, %r222, %r292, %r459;\n\tfma.rn.f32 \t%r489, %r192, %r293, %r458;\n\tfma.rn.f32 \t%r490, %r193, %r297, %r489;\n\tfma.rn.f32 \t%r491, %r223, %r296, %r488;\n\tfma.rn.f32 \t%r492, %r213, %r296, %r487;\n\tfma.rn.f32 \t%r493, %r203, %r296, %r486;\n\tfma.rn.f32 \t%r494, %r193, %r296, %r485;\n\tfma.rn.f32 \t%r495, %r223, %r295, %r484;\n\tfma.rn.f32 \t%r496, %r213, %r295, %r483;\n\tfma.rn.f32 \t%r497, %r203, %r295, %r482;\n\tfma.rn.f32 \t%r498, %r193, %r295, %r481;\n\tfma.rn.f32 \t%r499, %r223, %r294, %r480;\n\tfma.rn.f32 \t%r500, %r213, %r294, %r479;\n\tfma.rn.f32 \t%r501, %r203, %r294, %r478;\n\tfma.rn.f32 \t%r502, %r193, %r294, %r477;\n\tfma.rn.f32 \t%r503, %r203, %r297, %r476;\n\tfma.rn.f32 \t%r504, %r213, %r297, %r475;\n\tfma.rn.f32 \t%r505, %r223, %r297, %r474;\n\tfma.rn.f32 \t%r506, %r220, %r301, %r505;\n\tfma.rn.f32 \t%r507, %r210, %r301, %r504;\n\tfma.rn.f32 \t%r508, %r200, %r301, %r503;\n\tfma.rn.f32 \t%r509, %r190, %r298, %r502;\n\tfma.rn.f32 \t%r510, %r200, %r298, %r501;\n\tfma.rn.f32 \t%r511, %r210, %r298, %r500;\n\tfma.rn.f32 \t%r512, %r220, %r298, %r499;\n\tfma.rn.f32 \t%r513, %r190, %r299, %r498;\n\tfma.rn.f32 \t%r514, %r200, %r299, %r497;\n\tfma.rn.f32 \t%r515, %r210, %r299, %r496;\n\tfma.rn.f32 \t%r516, %r220, %r299, %r495;\n\tfma.rn.f32 \t%r517, %r190, %r300, %r494;\n\tfma.rn.f32 \t%r518, %r200, %r300, %r493;\n\tfma.rn.f32 \t%r519, %r210, %r300, %r492;\n\tfma.rn.f32 \t%r520, %r220, %r300, %r491;\n\tfma.rn.f32 \t%r521, %r190, %r301, %r490;\n\tfma.rn.f32 \t%r522, %r191, %r305, %r521;\n\tfma.rn.f32 \t%r523, %r221, %r304, %r520;\n\tfma.rn.f32 \t%r524, %r211, %r304, %r519;\n\tfma.rn.f32 \t%r525, %r201, %r304, %r518;\n\tfma.rn.f32 \t%r526, %r191, %r304, %r517;\n\tfma.rn.f32 \t%r527, %r221, %r303, %r516;\n\tfma.rn.f32 \t%r528, %r211, %r303, %r515;\n\tfma.rn.f32 \t%r529, %r201, %r303, %r514;\n\tfma.rn.f32 \t%r530, %r191, %r303, %r513;\n\tfma.rn.f32 \t%r531, %r221, %r302, %r512;\n\tfma.rn.f32 \t%r532, %r211, %r302, %r511;\n\tfma.rn.f32 \t%r533, %r201, %r302, %r510;\n\tfma.rn.f32 \t%r534, %r191, %r302, %r509;\n\tfma.rn.f32 \t%r535, %r201, %r305, %r508;\n\tfma.rn.f32 \t%r536, %r211, %r305, %r507;\n\tfma.rn.f32 \t%r537, %r221, %r305, %r506;\n\tfma.rn.f32 \t%r538, %r224, %r309, %r537;\n\tfma.rn.f32 \t%r539, %r214, %r309, %r536;\n\tfma.rn.f32 \t%r540, %r204, %r309, %r535;\n\tfma.rn.f32 \t%r541, %r194, %r306, %r534;\n\tfma.rn.f32 \t%r542, %r204, %r306, %r533;\n\tfma.rn.f32 \t%r543, %r214, %r306, %r532;\n\tfma.rn.f32 \t%r544, %r224, %r306, %r531;\n\tfma.rn.f32 \t%r545, %r194, %r307, %r530;\n\tfma.rn.f32 \t%r546, %r204, %r307, %r529;\n\tfma.rn.f32 \t%r547, %r214, %r307, %r528;\n\tfma.rn.f32 \t%r548, %r224, %r307, %r527;\n\tfma.rn.f32 \t%r549, %r194, %r308, %r526;\n\tfma.rn.f32 \t%r550, %r204, %r308, %r525;\n\tfma.rn.f32 \t%r551, %r214, %r308, %r524;\n\tfma.rn.f32 \t%r552, %r224, %r308, %r523;\n\tfma.rn.f32 \t%r553, %r194, %r309, %r522;\n\tfma.rn.f32 \t%r607, %r195, %r313, %r553;\n\tfma.rn.f32 \t%r606, %r225, %r312, %r552;\n\tfma.rn.f32 \t%r605, %r215, %r312, %r551;\n\tfma.rn.f32 \t%r604, %r205, %r312, %r550;\n\tfma.rn.f32 \t%r603, %r195, %r312, %r549;\n\tfma.rn.f32 \t%r602, %r225, %r311, %r548;\n\tfma.rn.f32 \t%r601, %r215, %r311, %r547;\n\tfma.rn.f32 \t%r600, %r205, %r311, %r546;\n\tfma.rn.f32 \t%r599, %r195, %r311, %r545;\n\tfma.rn.f32 \t%r598, %r225, %r310, %r544;\n\tfma.rn.f32 \t%r597, %r215, %r310, %r543;\n\tfma.rn.f32 \t%r596, %r205, %r310, %r542;\n\tfma.rn.f32 \t%r595, %r195, %r310, %r541;\n\tfma.rn.f32 \t%r608, %r205, %r313, %r540;\n\tfma.rn.f32 \t%r609, %r215, %r313, %r539;\n\tfma.rn.f32 \t%r610, %r225, %r313, %r538;\n\t.loc\t1 68 18 // test_complex_kernels.py:68:18\n\tadd.s64 \t%rd24, %rd24, 32;\n\t.loc\t1 64 22 // test_complex_kernels.py:64:22\n\tadd.s64 \t%rd23, %rd23, %rd3;\n\tadd.s32 \t%r594, %r594, -1;\n\tadd.s32 \t%r593, %r593, -16;\n\tsetp.ne.s32 \t%p5, %r594, 0;\n\t@%p5 bra \t$L__BB0_3;\n// %bb.4: // %._crit_edge.loopexit\n\t.loc\t1 70 23 // test_complex_kernels.py:70:23\n\tcvt.rn.f16.f32 \t%rs25, %r595;\n\tcvt.rn.f16.f32 \t%rs26, %r596;\n\tmov.b32 \t%r612, {%rs25, %rs26};\n\tcvt.rn.f16.f32 \t%rs27, %r597;\n\tcvt.rn.f16.f32 \t%rs28, %r598;\n\tmov.b32 \t%r613, {%rs27, %rs28};\n\tcvt.rn.f16.f32 \t%rs29, %r599;\n\tcvt.rn.f16.f32 \t%rs30, %r600;\n\tmov.b32 \t%r614, {%rs29, %rs30};\n\tcvt.rn.f16.f32 \t%rs31, %r601;\n\tcvt.rn.f16.f32 \t%rs32, %r602;\n\tmov.b32 \t%r615, {%rs31, %rs32};\n\tcvt.rn.f16.f32 \t%rs33, %r603;\n\tcvt.rn.f16.f32 \t%rs34, %r604;\n\tmov.b32 \t%r616, {%rs33, %rs34};\n\tcvt.rn.f16.f32 \t%rs35, %r605;\n\tcvt.rn.f16.f32 \t%rs36, %r606;\n\tmov.b32 \t%r617, {%rs35, %rs36};\n\tcvt.rn.f16.f32 \t%rs37, %r607;\n\tcvt.rn.f16.f32 \t%rs38, %r608;\n\tmov.b32 \t%r618, {%rs37, %rs38};\n\tcvt.rn.f16.f32 \t%rs39, %r609;\n\tcvt.rn.f16.f32 \t%rs40, %r610;\n\tmov.b32 \t%r619, {%rs39, %rs40};\n\tbra.uni \t$L__BB0_5;\n$L__BB0_1: // %.._crit_edge_crit_edge\n\t.loc\t1 76 21 // test_complex_kernels.py:76:21\n\tshl.b32 \t%r611, %r2, 4;\n\tmov.b32 \t%r612, 0;\n\tmov.b32 \t%r613, %r612;\n\tmov.b32 \t%r614, %r612;\n\tmov.b32 \t%r615, %r612;\n\tmov.b32 \t%r616, %r612;\n\tmov.b32 \t%r617, %r612;\n\tmov.b32 \t%r618, %r612;\n\tmov.b32 \t%r619, %r612;\n$L__BB0_5: // %._crit_edge\n\t.loc\t1 57 38 // test_complex_kernels.py:57:38\n\tor.b32 \t%r562, %r1, %r4;\n\tor.b32 \t%r563, %r1, %r3;\n\t.loc\t1 74 33 // test_complex_kernels.py:74:33\n\tmul.lo.s32 \t%r564, %r563, %r73;\n\tmul.lo.s32 \t%r565, %r562, %r73;\n\t.loc\t1 74 21 // test_complex_kernels.py:74:21\n\tmul.wide.s32 \t%rd18, %r564, 2;\n\tadd.s64 \t%rd19, %rd11, %rd18;\n\tmul.wide.s32 \t%rd20, %r565, 2;\n\tadd.s64 \t%rd21, %rd11, %rd20;\n\t.loc\t1 74 52 // test_complex_kernels.py:74:52\n\tmul.wide.s32 \t%rd22, %r6, 2;\n\tadd.s64 \t%rd16, %rd19, %rd22;\n\tadd.s64 \t%rd17, %rd21, %rd22;\n\t.loc\t1 75 33 // test_complex_kernels.py:75:33\n\tsetp.lt.s32 \t%p8, %r563, %r68;\n\tsetp.lt.s32 \t%p9, %r562, %r68;\n\t.loc\t1 75 58 // test_complex_kernels.py:75:58\n\tsetp.lt.s32 \t%p10, %r6, %r69;\n\t.loc\t1 75 39 // test_complex_kernels.py:75:39\n\tand.pred \t%p6, %p8, %p10;\n\tand.pred \t%p7, %p9, %p10;\n\t.loc\t1 76 21 // test_complex_kernels.py:76:21\n\tbar.sync \t0;\n\tand.b32 \t%r566, %r611, 96;\n\tand.b32 \t%r567, %r7, 136;\n\tor.b32 \t%r568, %r566, %r567;\n\tshl.b32 \t%r569, %r2, 1;\n\tand.b32 \t%r570, %r569, 16;\n\tor.b32 \t%r571, %r568, %r570;\n\tadd.s32 \t%r573, %r592, %r571;\n\tmov.b32 \t{%rs41, %rs42}, %r612;\n\tmov.b32 \t{%rs43, %rs44}, %r614;\n\tmov.b32 \t{%rs45, %rs46}, %r616;\n\tmov.b32 \t{%rs47, %rs48}, %r618;\n\tst.shared.v4.b16 \t[%r573], {%rs41, %rs43, %rs45, %rs47};\n\txor.b32 \t%r574, %r571, 272;\n\tadd.s32 \t%r575, %r592, %r574;\n\tst.shared.v4.b16 \t[%r575], {%rs42, %rs44, %rs46, %rs48};\n\txor.b32 \t%r576, %r571, 520;\n\tadd.s32 \t%r577, %r592, %r576;\n\tmov.b32 \t{%rs49, %rs50}, %r613;\n\tmov.b32 \t{%rs51, %rs52}, %r615;\n\tmov.b32 \t{%rs53, %rs54}, %r617;\n\tmov.b32 \t{%rs55, %rs56}, %r619;\n\tst.shared.v4.b16 \t[%r577], {%rs49, %rs51, %rs53, %rs55};\n\txor.b32 \t%r578, %r571, 792;\n\tadd.s32 \t%r579, %r592, %r578;\n\tst.shared.v4.b16 \t[%r579], {%rs50, %rs52, %rs54, %rs56};\n\tbar.sync \t0;\n\tshl.b32 \t%r580, %r2, 6;\n\tand.b32 \t%r581, %r580, 768;\n\tshl.b32 \t%r582, %r5, 5;\n\tor.b32 \t%r583, %r581, %r582;\n\tshl.b32 \t%r584, %r2, 2;\n\tand.b32 \t%r585, %r584, 16;\n\tor.b32 \t%r586, %r583, %r585;\n\tand.b32 \t%r587, %r2, 24;\n\txor.b32 \t%r588, %r586, %r587;\n\tadd.s32 \t%r589, %r592, %r588;\n\tld.shared.v2.b32 \t{%r558, %r559}, [%r589+128];\n\txor.b32 \t%r590, %r588, 8;\n\tadd.s32 \t%r591, %r592, %r590;\n\tld.shared.v2.b32 \t{%r560, %r561}, [%r591+128];\n\tld.shared.v2.b32 \t{%r554, %r555}, [%r589];\n\tld.shared.v2.b32 \t{%r556, %r557}, [%r591];\n\t// begin inline asm\n\t@%p6 st.global.v4.b32 [ %rd16 + 0 ], { %r554, %r555, %r556, %r557 };\n\t// end inline asm\n\t// begin inline asm\n\t@%p7 st.global.v4.b32 [ %rd17 + 0 ], { %r558, %r559, %r560, %r561 };\n\t// end inline asm\n\t.loc\t1 76 4 // test_complex_kernels.py:76:4\n\tret;\n$L__tmp6:\n$L__func_end0:\n // -- End function\n}\n\t.file\t1 \"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\"\n\t.file\t2 \"/scratch/findhao/pta/triton/python/triton/language/standard.py\"\n\t.section\t.debug_abbrev\n\t{\n.b8 1 // Abbreviation Code\n.b8 17 // DW_TAG_compile_unit\n.b8 1 // DW_CHILDREN_yes\n.b8 37 // DW_AT_producer\n.b8 8 // DW_FORM_string\n.b8 19 // DW_AT_language\n.b8 5 // DW_FORM_data2\n.b8 3 // DW_AT_name\n.b8 8 // DW_FORM_string\n.b8 16 // DW_AT_stmt_list\n.b8 6 // DW_FORM_data4\n.b8 27 // DW_AT_comp_dir\n.b8 8 // DW_FORM_string\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 2 // Abbreviation Code\n.b8 46 // DW_TAG_subprogram\n.b8 0 // DW_CHILDREN_no\n.b8 3 // DW_AT_name\n.b8 8 // DW_FORM_string\n.b8 32 // DW_AT_inline\n.b8 11 // DW_FORM_data1\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 3 // Abbreviation Code\n.b8 46 // DW_TAG_subprogram\n.b8 1 // DW_CHILDREN_yes\n.b8 17 // DW_AT_low_pc\n.b8 1 // DW_FORM_addr\n.b8 18 // DW_AT_high_pc\n.b8 1 // DW_FORM_addr\n.b8 49 // DW_AT_abstract_origin\n.b8 19 // DW_FORM_ref4\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 4 // Abbreviation Code\n.b8 29 // DW_TAG_inlined_subroutine\n.b8 0 // DW_CHILDREN_no\n.b8 49 // DW_AT_abstract_origin\n.b8 19 // DW_FORM_ref4\n.b8 17 // DW_AT_low_pc\n.b8 1 // DW_FORM_addr\n.b8 18 // DW_AT_high_pc\n.b8 1 // DW_FORM_addr\n.b8 88 // DW_AT_call_file\n.b8 11 // DW_FORM_data1\n.b8 89 // DW_AT_call_line\n.b8 11 // DW_FORM_data1\n.b8 87 // DW_AT_call_column\n.b8 11 // DW_FORM_data1\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 0 // EOM(3)\n\t}\n\t.section\t.debug_info\n\t{\n.b32 191 // Length of Unit\n.b8 2 // DWARF version number\n.b8 0\n.b32 .debug_abbrev // Offset Into Abbrev. Section\n.b8 8 // Address Size (in bytes)\n.b8 1 // Abbrev [1] 0xb:0xb8 DW_TAG_compile_unit\n.b8 116 // DW_AT_producer\n.b8 114\n.b8 105\n.b8 116\n.b8 111\n.b8 110\n.b8 0\n.b8 2 // DW_AT_language\n.b8 0\n.b8 116 // DW_AT_name\n.b8 101\n.b8 115\n.b8 116\n.b8 95\n.b8 99\n.b8 111\n.b8 109\n.b8 112\n.b8 108\n.b8 101\n.b8 120\n.b8 95\n.b8 107\n.b8 101\n.b8 114\n.b8 110\n.b8 101\n.b8 108\n.b8 115\n.b8 46\n.b8 112\n.b8 121\n.b8 0\n.b32 .debug_line // DW_AT_stmt_list\n.b8 47 // DW_AT_comp_dir\n.b8 115\n.b8 99\n.b8 114\n.b8 97\n.b8 116\n.b8 99\n.b8 104\n.b8 47\n.b8 102\n.b8 105\n.b8 110\n.b8 100\n.b8 104\n.b8 97\n.b8 111\n.b8 47\n.b8 116\n.b8 114\n.b8 105\n.b8 116\n.b8 111\n.b8 110\n.b8 112\n.b8 97\n.b8 114\n.b8 115\n.b8 101\n.b8 47\n.b8 116\n.b8 101\n.b8 115\n.b8 116\n.b8 115\n.b8 0\n.b8 2 // Abbrev [2] 0x54:0x10 DW_TAG_subprogram\n.b8 109 // DW_AT_name\n.b8 97\n.b8 116\n.b8 109\n.b8 117\n.b8 108\n.b8 95\n.b8 107\n.b8 101\n.b8 114\n.b8 110\n.b8 101\n.b8 108\n.b8 0\n.b8 1 // DW_AT_inline\n.b8 3 // Abbrev [3] 0x64:0x5e DW_TAG_subprogram\n.b64 $L__func_begin0 // DW_AT_low_pc\n.b64 $L__func_end0 // DW_AT_high_pc\n.b32 84 // DW_AT_abstract_origin\n.b8 4 // Abbrev [4] 0x79:0x18 DW_TAG_inlined_subroutine\n.b32 84 // DW_AT_abstract_origin\n.b64 $L__tmp1 // DW_AT_low_pc\n.b64 $L__tmp2 // DW_AT_high_pc\n.b8 1 // DW_AT_call_file\n.b8 48 // DW_AT_call_line\n.b8 27 // DW_AT_call_column\n.b8 4 // Abbrev [4] 0x91:0x18 DW_TAG_inlined_subroutine\n.b32 84 // DW_AT_abstract_origin\n.b64 $L__tmp2 // DW_AT_low_pc\n.b64 $L__tmp3 // DW_AT_high_pc\n.b8 1 // DW_AT_call_file\n.b8 49 // DW_AT_call_line\n.b8 27 // DW_AT_call_column\n.b8 4 // Abbrev [4] 0xa9:0x18 DW_TAG_inlined_subroutine\n.b32 84 // DW_AT_abstract_origin\n.b64 $L__tmp4 // DW_AT_low_pc\n.b64 $L__tmp5 // DW_AT_high_pc\n.b8 1 // DW_AT_call_file\n.b8 64 // DW_AT_call_line\n.b8 33 // DW_AT_call_column\n.b8 0 // End Of Children Mark\n.b8 0 // End Of Children Mark\n\t}\n\t.section\t.debug_macinfo\t{\t}\n","matmul_kernel.json":"{\"hash\": \"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26\", \"target\": {\"backend\": \"cuda\", \"arch\": 75, \"warp_size\": 32}, \"num_warps\": 1, \"num_ctas\": 1, \"num_stages\": 1, \"warp_size\": 32, \"maxnreg\": null, \"cluster_dims\": [1, 1, 1], \"ptx_version\": null, \"ptx_options\": null, \"ir_override\": null, \"enable_fp_fusion\": true, \"launch_cooperative_grid\": false, \"launch_pdl\": false, \"supported_fp8_dtypes\": [\"fp8e4b15\", \"fp8e5\"], \"deprecated_fp8_dot_operand_dtypes\": [], \"default_dot_input_precision\": \"tf32\", \"allowed_dot_input_precisions\": [\"tf32\", \"tf32x3\", \"ieee\"], \"max_num_imprecise_acc_default\": 0, \"extern_libs\": [[\"libdevice\", \"/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc\"]], \"debug\": false, \"backend_name\": \"cuda\", \"sanitize_overflow\": true, \"arch\": \"sm75\", \"triton_version\": \"3.4.0\", \"tensordesc_meta\": [], \"shared\": 2048, \"tmem_size\": 0, \"global_scratch_size\": 0, \"global_scratch_align\": 1, \"name\": \"matmul_kernel\"}"},"python_source":{"file_path":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","start_line":29,"end_line":77,"code":"@triton.autotune(\n configs=[\n triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1}, num_stages=1, num_warps=1),\n triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1}, num_stages=1, num_warps=1),\n triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1}, num_stages=1, num_warps=1),\n ],\n key=['M', 'N', 'K'],\n)\n@triton.jit\ndef matmul_kernel(\n a_ptr, b_ptr, c_ptr,\n M, N, K,\n stride_am, stride_ak,\n stride_bk, stride_bn,\n stride_cm, stride_cn,\n BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,\n GROUP_SIZE_M: tl.constexpr,\n):\n pid = tl.program_id(axis=0)\n num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)\n num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)\n num_pid_in_group = GROUP_SIZE_M * num_pid_n\n group_id = pid // num_pid_in_group\n first_pid_m = group_id * GROUP_SIZE_M\n group_size = min(num_pid_m - first_pid_m, GROUP_SIZE_M)\n pid_m = first_pid_m + (pid % group_size)\n pid_n = (pid % num_pid_in_group) // group_size\n\n offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M\n offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N\n offs_k = tl.arange(0, BLOCK_SIZE_K)\n a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)\n b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)\n\n accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)\n for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):\n a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)\n b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)\n accumulator += tl.dot(a, b)\n a_ptrs += BLOCK_SIZE_K * stride_ak\n b_ptrs += BLOCK_SIZE_K * stride_bk\n c = accumulator.to(tl.float16)\n\n offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)\n offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)\n c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]\n c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)\n tl.store(c_ptrs, c, mask=c_mask)\n"},"times":{"ir_initialization":803,"lowering_stages":[],"store_results":0}}} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":null,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":149,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00000"},"b_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00200"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac00400"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":129,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c1 = matmul(a1, b1)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":250,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"ret = self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":149,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":149,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":149,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[2],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[4],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00600"},"b_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac00a00"},"c_ptr":{"type":"tensor","shape":[32,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":1024,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":2048,"data_ptr":"0x7e744ac00e00"},"M":{"type":"int","value":32},"N":{"type":"int","value":32},"K":{"type":"int","value":16},"stride_am":{"type":"int","value":16},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":32},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":32},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":136,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c2 = matmul(a2, b2)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":250,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"ret = self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":149,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431739328448,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"72d6784240288dd92f97cdba9bcf5c9e5c375421e627b69d3cbcdc96e3885b00","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":1024,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":149,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769347952,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e3f911679811bf960275b0153a03c81ba6c52d64f8e181258de2907e207490a7","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":32},"BLOCK_SIZE_N":{"type":"int","value":16},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":149,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":160,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":172,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:06.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":238,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"benchmark()"},{"line":227,"name":"benchmark","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":227,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}"},{"line":162,"name":"_bench","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))"},{"line":185,"name":"do_bench","filename":"/scratch/findhao/pta/triton/python/triton/testing.py","loc":"fn()"},{"line":148,"name":"kernel_call","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"matmul_kernel","function":98431769671488,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"e648e0ca045fee328fafb3287222ded88fd50fc1975ba772e1c38b148a48dd26","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"matmul_kernel","num_ctas":1,"num_stages":1,"num_warps":1,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":2048,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[16,32],"dtype":"torch.float16","device":"cuda:0","stride":[32,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01600"},"b_ptr":{"type":"tensor","shape":[32,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":512,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":1024,"data_ptr":"0x7e744ac01a00"},"c_ptr":{"type":"tensor","shape":[16,16],"dtype":"torch.float16","device":"cuda:0","stride":[16,1],"numel":256,"is_contiguous":true,"element_size":2,"storage_offset":0,"memory_usage":512,"data_ptr":"0x7e744ac01e00"},"M":{"type":"int","value":16},"N":{"type":"int","value":16},"K":{"type":"int","value":32},"stride_am":{"type":"int","value":32},"stride_ak":{"type":"int","value":1},"stride_bk":{"type":"int","value":16},"stride_bn":{"type":"int","value":1},"stride_cm":{"type":"int","value":16},"stride_cn":{"type":"int","value":1},"BLOCK_SIZE_M":{"type":"int","value":16},"BLOCK_SIZE_N":{"type":"int","value":32},"BLOCK_SIZE_K":{"type":"int","value":16},"GROUP_SIZE_M":{"type":"int","value":1}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":143,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"c3 = matmul(a3, b3)"},{"line":84,"name":"matmul","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"matmul_kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1))"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":250,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/autotuner.py","loc":"ret = self.fn.run("},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"compilation","pid":171439,"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":155,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"out1 = fused_op(x, y, z, scale_factor=1.0, activation=\"none\")"},{"line":116,"name":"fused_op","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"fused_op_kernel[grid](a, b, c, output, n_elements, scale_factor, ACTIVATION=activation, BLOCK_SIZE=BLOCK_SIZE)"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":593,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel = self._do_compile(key, signature, device, constexprs, options, attrs, warmup)"},{"line":773,"name":"_do_compile","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel = self.compile(src, target=target, options=options.__dict__)"},{"line":267,"name":"compile","filename":"/scratch/findhao/pta/triton/python/triton/compiler/compiler.py","loc":"compilation_listener("},{"line":752,"name":"maybe_trace_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton("},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ","payload":{"metadata":{"cache_hit":true,"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"707cff483ced756b01ffb6c9ce08096adcb2997575b87d40a9b91e567b955d70","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"fused_op_kernel","num_ctas":1,"num_stages":3,"num_warps":4,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":0,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32,"env":{},"src_attrs":{"(0,)":[["tt.divisibility",16]],"(1,)":[["tt.divisibility",16]],"(2,)":[["tt.divisibility",16]],"(3,)":[["tt.divisibility",16]],"(4,)":[],"(6,)":[]},"src_cache_key":"3a0d6e78c4ea91afcbe3e257a4fc286afdba531229fcfd7e93314f8100fc0d32","src_constants":{"(6,)":"none","(7,)":8}},"file_path":{"fused_op_kernel.source":"/home/findhao/.triton/cache/OB6P6SB45V2WWAP7W3E44CAJNLOLFGLVOW4H2QFJXEPFM64VLVYA/fused_op_kernel.source","fused_op_kernel.ttir":"/home/findhao/.triton/cache/OB6P6SB45V2WWAP7W3E44CAJNLOLFGLVOW4H2QFJXEPFM64VLVYA/fused_op_kernel.ttir","fused_op_kernel.ttgir":"/home/findhao/.triton/cache/OB6P6SB45V2WWAP7W3E44CAJNLOLFGLVOW4H2QFJXEPFM64VLVYA/fused_op_kernel.ttgir","fused_op_kernel.llir":"/home/findhao/.triton/cache/OB6P6SB45V2WWAP7W3E44CAJNLOLFGLVOW4H2QFJXEPFM64VLVYA/fused_op_kernel.llir","fused_op_kernel.ptx":"/home/findhao/.triton/cache/OB6P6SB45V2WWAP7W3E44CAJNLOLFGLVOW4H2QFJXEPFM64VLVYA/fused_op_kernel.ptx","fused_op_kernel.cubin":"/home/findhao/.triton/cache/OB6P6SB45V2WWAP7W3E44CAJNLOLFGLVOW4H2QFJXEPFM64VLVYA/fused_op_kernel.cubin","fused_op_kernel.json":"/home/findhao/.triton/cache/OB6P6SB45V2WWAP7W3E44CAJNLOLFGLVOW4H2QFJXEPFM64VLVYA/fused_op_kernel.json"},"file_content":{"fused_op_kernel.ttir":"#loc = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0)\nmodule {\n tt.func public @fused_op_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg3: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg4: i32 loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg5: f32 loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0)) attributes {noinline = false} {\n %c8_i32 = arith.constant 8 : i32 loc(#loc1)\n %0 = tt.get_program_id x : i32 loc(#loc2)\n %1 = arith.muli %0, %c8_i32 : i32 loc(#loc3)\n %2 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc4)\n %3 = tt.splat %1 : i32 -> tensor<8xi32> loc(#loc5)\n %4 = arith.addi %3, %2 : tensor<8xi32> loc(#loc5)\n %5 = tt.splat %arg4 : i32 -> tensor<8xi32> loc(#loc6)\n %6 = arith.cmpi slt, %4, %5 : tensor<8xi32> loc(#loc6)\n %7 = tt.splat %arg0 : !tt.ptr -> tensor<8x!tt.ptr> loc(#loc7)\n %8 = tt.addptr %7, %4 : tensor<8x!tt.ptr>, tensor<8xi32> loc(#loc7)\n %9 = tt.load %8, %6 : tensor<8x!tt.ptr> loc(#loc8)\n %10 = tt.splat %arg1 : !tt.ptr -> tensor<8x!tt.ptr> loc(#loc9)\n %11 = tt.addptr %10, %4 : tensor<8x!tt.ptr>, tensor<8xi32> loc(#loc9)\n %12 = tt.load %11, %6 : tensor<8x!tt.ptr> loc(#loc10)\n %13 = tt.splat %arg2 : !tt.ptr -> tensor<8x!tt.ptr> loc(#loc11)\n %14 = tt.addptr %13, %4 : tensor<8x!tt.ptr>, tensor<8xi32> loc(#loc11)\n %15 = tt.load %14, %6 : tensor<8x!tt.ptr> loc(#loc12)\n %16 = arith.mulf %9, %12 : tensor<8xf32> loc(#loc13)\n %17 = tt.splat %arg5 : f32 -> tensor<8xf32> loc(#loc14)\n %18 = arith.mulf %16, %17 : tensor<8xf32> loc(#loc14)\n %19 = arith.addf %18, %15 : tensor<8xf32> loc(#loc15)\n %20 = tt.splat %arg3 : !tt.ptr -> tensor<8x!tt.ptr> loc(#loc16)\n %21 = tt.addptr %20, %4 : tensor<8x!tt.ptr>, tensor<8xi32> loc(#loc16)\n tt.store %21, %19, %6 : tensor<8x!tt.ptr> loc(#loc17)\n tt.return loc(#loc18)\n } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":97:24)\n#loc3 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":98:20)\n#loc4 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":98:46)\n#loc5 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":98:33)\n#loc6 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":99:21)\n#loc7 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":101:24)\n#loc8 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":101:16)\n#loc9 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":102:24)\n#loc10 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":102:16)\n#loc11 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":103:24)\n#loc12 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":103:16)\n#loc13 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":105:17)\n#loc14 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":105:21)\n#loc15 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":105:36)\n#loc16 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":109:26)\n#loc17 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":109:35)\n#loc18 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":109:4)\n","fused_op_kernel.ttgir":"#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>\n#loc = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0)\nmodule attributes {\"ttg.num-ctas\" = 1 : i32, \"ttg.num-warps\" = 4 : i32, ttg.target = \"cuda:75\", \"ttg.threads-per-warp\" = 32 : i32} {\n tt.func public @fused_op_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg3: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg4: i32 loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg5: f32 loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0)) attributes {noinline = false} {\n %c8_i32 = arith.constant 8 : i32 loc(#loc1)\n %0 = tt.get_program_id x : i32 loc(#loc2)\n %1 = arith.muli %0, %c8_i32 : i32 loc(#loc3)\n %2 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked> loc(#loc4)\n %3 = tt.splat %1 : i32 -> tensor<8xi32, #blocked> loc(#loc5)\n %4 = arith.addi %3, %2 : tensor<8xi32, #blocked> loc(#loc5)\n %5 = tt.splat %arg4 : i32 -> tensor<8xi32, #blocked> loc(#loc6)\n %6 = arith.cmpi slt, %4, %5 : tensor<8xi32, #blocked> loc(#loc6)\n %7 = tt.splat %arg0 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> loc(#loc7)\n %8 = tt.addptr %7, %4 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> loc(#loc7)\n %9 = tt.load %8, %6 : tensor<8x!tt.ptr, #blocked> loc(#loc8)\n %10 = tt.splat %arg1 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> loc(#loc9)\n %11 = tt.addptr %10, %4 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> loc(#loc9)\n %12 = tt.load %11, %6 : tensor<8x!tt.ptr, #blocked> loc(#loc10)\n %13 = tt.splat %arg2 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> loc(#loc11)\n %14 = tt.addptr %13, %4 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> loc(#loc11)\n %15 = tt.load %14, %6 : tensor<8x!tt.ptr, #blocked> loc(#loc12)\n %16 = arith.mulf %9, %12 : tensor<8xf32, #blocked> loc(#loc13)\n %17 = tt.splat %arg5 : f32 -> tensor<8xf32, #blocked> loc(#loc14)\n %18 = arith.mulf %16, %17 : tensor<8xf32, #blocked> loc(#loc14)\n %19 = arith.addf %18, %15 : tensor<8xf32, #blocked> loc(#loc15)\n %20 = tt.splat %arg3 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> loc(#loc16)\n %21 = tt.addptr %20, %4 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> loc(#loc16)\n tt.store %21, %19, %6 : tensor<8x!tt.ptr, #blocked> loc(#loc17)\n tt.return loc(#loc18)\n } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":97:24)\n#loc3 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":98:20)\n#loc4 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":98:46)\n#loc5 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":98:33)\n#loc6 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":99:21)\n#loc7 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":101:24)\n#loc8 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":101:16)\n#loc9 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":102:24)\n#loc10 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":102:16)\n#loc11 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":103:24)\n#loc12 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":103:16)\n#loc13 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":105:17)\n#loc14 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":105:21)\n#loc15 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":105:36)\n#loc16 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":109:26)\n#loc17 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":109:35)\n#loc18 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":109:4)\n","fused_op_kernel.llir":"; ModuleID = 'LLVMDialectModule'\nsource_filename = \"LLVMDialectModule\"\ntarget datalayout = \"e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64\"\n\ndefine ptx_kernel void @fused_op_kernel(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, float %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !5 {\n %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8\n %9 = shl i32 %8, 3, !dbg !9\n %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10\n %11 = and i32 %10, 7, !dbg !10\n %12 = or disjoint i32 %9, %11, !dbg !11\n %13 = icmp slt i32 %12, %4, !dbg !12\n %14 = sext i32 %12 to i64, !dbg !13\n %15 = getelementptr float, ptr addrspace(1) %0, i64 %14, !dbg !13\n %16 = tail call i32 asm sideeffect \"mov.u32 $0, 0x0;\\0A\\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];\", \"=r,l,b\"(ptr addrspace(1) %15, i1 %13) #2, !dbg !14\n %17 = bitcast i32 %16 to float, !dbg !14\n %18 = getelementptr float, ptr addrspace(1) %1, i64 %14, !dbg !15\n %19 = tail call i32 asm sideeffect \"mov.u32 $0, 0x0;\\0A\\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];\", \"=r,l,b\"(ptr addrspace(1) %18, i1 %13) #2, !dbg !16\n %20 = bitcast i32 %19 to float, !dbg !16\n %21 = getelementptr float, ptr addrspace(1) %2, i64 %14, !dbg !17\n %22 = tail call i32 asm sideeffect \"mov.u32 $0, 0x0;\\0A\\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];\", \"=r,l,b\"(ptr addrspace(1) %21, i1 %13) #2, !dbg !18\n %23 = bitcast i32 %22 to float, !dbg !18\n %24 = fmul float %17, %20, !dbg !19\n %25 = fmul float %5, %24, !dbg !20\n %26 = fadd float %25, %23, !dbg !21\n %27 = getelementptr float, ptr addrspace(1) %3, i64 %14, !dbg !22\n %28 = and i32 %10, 120, !dbg !23\n %29 = icmp eq i32 %28, 0, !dbg !23\n %30 = bitcast float %26 to i32, !dbg !23\n %31 = and i1 %29, %13, !dbg !23\n tail call void asm sideeffect \"@$2 st.global.b32 [ $1 + 0 ], { $0 };\", \"r,l,b\"(i32 %30, ptr addrspace(1) %27, i1 %31) #2, !dbg !23\n ret void, !dbg !24\n}\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1\n\nattributes #0 = { \"nvvm.reqntid\"=\"128\" }\nattributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }\nattributes #2 = { nounwind }\n\n!llvm.dbg.cu = !{!0}\n!llvm.module.flags = !{!2, !3}\n!llvm.ident = !{!4}\n\n!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: \"triton\", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)\n!1 = !DIFile(filename: \"test_complex_kernels.py\", directory: \"/scratch/findhao/tritonparse/tests\")\n!2 = !{i32 2, !\"Debug Info Version\", i32 3}\n!3 = !{i32 4, !\"nvvm-reflect-ftz\", i32 1}\n!4 = !{!\"clang version 3.8.0 (tags/RELEASE_380/final)\"}\n!5 = distinct !DISubprogram(name: \"fused_op_kernel\", linkageName: \"fused_op_kernel\", scope: !1, file: !1, line: 90, type: !6, scopeLine: 90, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)\n!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)\n!7 = !{}\n!8 = !DILocation(line: 97, column: 24, scope: !5)\n!9 = !DILocation(line: 98, column: 20, scope: !5)\n!10 = !DILocation(line: 98, column: 46, scope: !5)\n!11 = !DILocation(line: 98, column: 33, scope: !5)\n!12 = !DILocation(line: 99, column: 21, scope: !5)\n!13 = !DILocation(line: 101, column: 24, scope: !5)\n!14 = !DILocation(line: 101, column: 16, scope: !5)\n!15 = !DILocation(line: 102, column: 24, scope: !5)\n!16 = !DILocation(line: 102, column: 16, scope: !5)\n!17 = !DILocation(line: 103, column: 24, scope: !5)\n!18 = !DILocation(line: 103, column: 16, scope: !5)\n!19 = !DILocation(line: 105, column: 17, scope: !5)\n!20 = !DILocation(line: 105, column: 21, scope: !5)\n!21 = !DILocation(line: 105, column: 36, scope: !5)\n!22 = !DILocation(line: 109, column: 26, scope: !5)\n!23 = !DILocation(line: 109, column: 35, scope: !5)\n!24 = !DILocation(line: 109, column: 4, scope: !5)\n","fused_op_kernel.ptx":"//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 8.7\n.target sm_75\n.address_size 64\n\n\t// .globl\tfused_op_kernel // -- Begin function fused_op_kernel\n // @fused_op_kernel\n.visible .entry fused_op_kernel(\n\t.param .u64 .ptr .global .align 1 fused_op_kernel_param_0,\n\t.param .u64 .ptr .global .align 1 fused_op_kernel_param_1,\n\t.param .u64 .ptr .global .align 1 fused_op_kernel_param_2,\n\t.param .u64 .ptr .global .align 1 fused_op_kernel_param_3,\n\t.param .u32 fused_op_kernel_param_4,\n\t.param .f32 fused_op_kernel_param_5,\n\t.param .u64 .ptr .global .align 1 fused_op_kernel_param_6\n)\n.reqntid 128\n{\n\t.reg .pred \t%p<6>;\n\t.reg .b32 \t%r<14>;\n\t.reg .b64 \t%rd<10>;\n\t.loc\t1 90 0 // test_complex_kernels.py:90:0\n$L__func_begin0:\n\t.loc\t1 90 0 // test_complex_kernels.py:90:0\n\n// %bb.0:\n\tld.param.b64 \t%rd5, [fused_op_kernel_param_0];\n\tld.param.b64 \t%rd6, [fused_op_kernel_param_1];\n$L__tmp0:\n\t.loc\t1 97 24 // test_complex_kernels.py:97:24\n\tmov.u32 \t%r5, %ctaid.x;\n\t.loc\t1 98 20 // test_complex_kernels.py:98:20\n\tshl.b32 \t%r6, %r5, 3;\n\tld.param.b64 \t%rd7, [fused_op_kernel_param_2];\n\tld.param.b64 \t%rd8, [fused_op_kernel_param_3];\n\t.loc\t1 98 46 // test_complex_kernels.py:98:46\n\tmov.u32 \t%r7, %tid.x;\n\tand.b32 \t%r8, %r7, 7;\n\tld.param.b32 \t%r9, [fused_op_kernel_param_4];\n\t.loc\t1 98 33 // test_complex_kernels.py:98:33\n\tor.b32 \t%r10, %r6, %r8;\n\tld.param.b32 \t%r11, [fused_op_kernel_param_5];\n\t.loc\t1 99 21 // test_complex_kernels.py:99:21\n\tsetp.lt.s32 \t%p1, %r10, %r9;\n\t.loc\t1 101 24 // test_complex_kernels.py:101:24\n\tmul.wide.s32 \t%rd9, %r10, 4;\n\tadd.s64 \t%rd1, %rd5, %rd9;\n\t.loc\t1 101 16 // test_complex_kernels.py:101:16\n\t// begin inline asm\n\tmov.u32 %r1, 0x0;\n\t@%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];\n\t// end inline asm\n\t.loc\t1 102 24 // test_complex_kernels.py:102:24\n\tadd.s64 \t%rd2, %rd6, %rd9;\n\t.loc\t1 102 16 // test_complex_kernels.py:102:16\n\t// begin inline asm\n\tmov.u32 %r2, 0x0;\n\t@%p1 ld.global.b32 { %r2 }, [ %rd2 + 0 ];\n\t// end inline asm\n\t.loc\t1 103 24 // test_complex_kernels.py:103:24\n\tadd.s64 \t%rd3, %rd7, %rd9;\n\t.loc\t1 103 16 // test_complex_kernels.py:103:16\n\t// begin inline asm\n\tmov.u32 %r3, 0x0;\n\t@%p1 ld.global.b32 { %r3 }, [ %rd3 + 0 ];\n\t// end inline asm\n\t.loc\t1 105 17 // test_complex_kernels.py:105:17\n\tmul.f32 \t%r12, %r1, %r2;\n\t.loc\t1 105 36 // test_complex_kernels.py:105:36\n\tfma.rn.f32 \t%r4, %r11, %r12, %r3;\n\t.loc\t1 109 26 // test_complex_kernels.py:109:26\n\tadd.s64 \t%rd4, %rd8, %rd9;\n\t.loc\t1 109 35 // test_complex_kernels.py:109:35\n\tand.b32 \t%r13, %r7, 120;\n\tsetp.eq.s32 \t%p5, %r13, 0;\n\tand.pred \t%p4, %p5, %p1;\n\t// begin inline asm\n\t@%p4 st.global.b32 [ %rd4 + 0 ], { %r4 };\n\t// end inline asm\n\t.loc\t1 109 4 // test_complex_kernels.py:109:4\n\tret;\n$L__tmp1:\n$L__func_end0:\n // -- End function\n}\n\t.file\t1 \"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\"\n\t.section\t.debug_abbrev\n\t{\n.b8 1 // Abbreviation Code\n.b8 17 // DW_TAG_compile_unit\n.b8 0 // DW_CHILDREN_no\n.b8 37 // DW_AT_producer\n.b8 8 // DW_FORM_string\n.b8 19 // DW_AT_language\n.b8 5 // DW_FORM_data2\n.b8 3 // DW_AT_name\n.b8 8 // DW_FORM_string\n.b8 16 // DW_AT_stmt_list\n.b8 6 // DW_FORM_data4\n.b8 27 // DW_AT_comp_dir\n.b8 8 // DW_FORM_string\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 0 // EOM(3)\n\t}\n\t.section\t.debug_info\n\t{\n.b32 80 // Length of Unit\n.b8 2 // DWARF version number\n.b8 0\n.b32 .debug_abbrev // Offset Into Abbrev. Section\n.b8 8 // Address Size (in bytes)\n.b8 1 // Abbrev [1] 0xb:0x49 DW_TAG_compile_unit\n.b8 116 // DW_AT_producer\n.b8 114\n.b8 105\n.b8 116\n.b8 111\n.b8 110\n.b8 0\n.b8 2 // DW_AT_language\n.b8 0\n.b8 116 // DW_AT_name\n.b8 101\n.b8 115\n.b8 116\n.b8 95\n.b8 99\n.b8 111\n.b8 109\n.b8 112\n.b8 108\n.b8 101\n.b8 120\n.b8 95\n.b8 107\n.b8 101\n.b8 114\n.b8 110\n.b8 101\n.b8 108\n.b8 115\n.b8 46\n.b8 112\n.b8 121\n.b8 0\n.b32 .debug_line // DW_AT_stmt_list\n.b8 47 // DW_AT_comp_dir\n.b8 115\n.b8 99\n.b8 114\n.b8 97\n.b8 116\n.b8 99\n.b8 104\n.b8 47\n.b8 102\n.b8 105\n.b8 110\n.b8 100\n.b8 104\n.b8 97\n.b8 111\n.b8 47\n.b8 116\n.b8 114\n.b8 105\n.b8 116\n.b8 111\n.b8 110\n.b8 112\n.b8 97\n.b8 114\n.b8 115\n.b8 101\n.b8 47\n.b8 116\n.b8 101\n.b8 115\n.b8 116\n.b8 115\n.b8 0\n\t}\n\t.section\t.debug_macinfo\t{\t}\n","fused_op_kernel.json":"{\"hash\": \"707cff483ced756b01ffb6c9ce08096adcb2997575b87d40a9b91e567b955d70\", \"target\": {\"backend\": \"cuda\", \"arch\": 75, \"warp_size\": 32}, \"num_warps\": 4, \"num_ctas\": 1, \"num_stages\": 3, \"warp_size\": 32, \"maxnreg\": null, \"cluster_dims\": [1, 1, 1], \"ptx_version\": null, \"ptx_options\": null, \"ir_override\": null, \"enable_fp_fusion\": true, \"launch_cooperative_grid\": false, \"launch_pdl\": false, \"supported_fp8_dtypes\": [\"fp8e4b15\", \"fp8e5\"], \"deprecated_fp8_dot_operand_dtypes\": [], \"default_dot_input_precision\": \"tf32\", \"allowed_dot_input_precisions\": [\"tf32\", \"tf32x3\", \"ieee\"], \"max_num_imprecise_acc_default\": 0, \"extern_libs\": [[\"libdevice\", \"/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc\"]], \"debug\": false, \"backend_name\": \"cuda\", \"sanitize_overflow\": true, \"arch\": \"sm75\", \"triton_version\": \"3.4.0\", \"tensordesc_meta\": [], \"shared\": 0, \"tmem_size\": 0, \"global_scratch_size\": 0, \"global_scratch_align\": 1, \"name\": \"fused_op_kernel\"}"},"python_source":{"file_path":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","start_line":89,"end_line":110,"code":"@triton.jit\ndef fused_op_kernel(\n a_ptr, b_ptr, c_ptr, output_ptr,\n n_elements,\n scale_factor: float,\n ACTIVATION: tl.constexpr,\n BLOCK_SIZE: tl.constexpr,\n):\n pid = tl.program_id(axis=0)\n offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)\n mask = offsets < n_elements\n\n a = tl.load(a_ptr + offsets, mask=mask)\n b = tl.load(b_ptr + offsets, mask=mask)\n c = tl.load(c_ptr + offsets, mask=mask)\n\n result = a * b * scale_factor + c\n if ACTIVATION == \"relu\":\n result = tl.where(result > 0, result, 0.0)\n \n tl.store(output_ptr + offsets, result, mask=mask)\n"},"times":{"ir_initialization":1388,"lowering_stages":[],"store_results":0}}} +{"event_type":"launch","pid":171439,"name":"fused_op_kernel","function":null,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"707cff483ced756b01ffb6c9ce08096adcb2997575b87d40a9b91e567b955d70","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"fused_op_kernel","num_ctas":1,"num_stages":3,"num_warps":4,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":0,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[8],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":8,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":32,"data_ptr":"0x7e744ac02000"},"b_ptr":{"type":"tensor","shape":[8],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":8,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":32,"data_ptr":"0x7e744ac02200"},"c_ptr":{"type":"tensor","shape":[8],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":8,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":32,"data_ptr":"0x7e744ac02400"},"output_ptr":{"type":"tensor","shape":[8],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":8,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":32,"data_ptr":"0x7e744ac02600"},"n_elements":{"type":"int","value":8},"scale_factor":{"type":"float","value":1.0},"ACTIVATION":{"type":"str","value":"none","length":4},"BLOCK_SIZE":{"type":"int","value":8}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":155,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"out1 = fused_op(x, y, z, scale_factor=1.0, activation=\"none\")"},{"line":116,"name":"fused_op","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"fused_op_kernel[grid](a, b, c, output, n_elements, scale_factor, ACTIVATION=activation, BLOCK_SIZE=BLOCK_SIZE)"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"fused_op_kernel","function":98431773384832,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"707cff483ced756b01ffb6c9ce08096adcb2997575b87d40a9b91e567b955d70","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"fused_op_kernel","num_ctas":1,"num_stages":3,"num_warps":4,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":0,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[8],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":8,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":32,"data_ptr":"0x7e744ac02000"},"b_ptr":{"type":"tensor","shape":[8],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":8,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":32,"data_ptr":"0x7e744ac02200"},"c_ptr":{"type":"tensor","shape":[8],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":8,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":32,"data_ptr":"0x7e744ac02400"},"output_ptr":{"type":"tensor","shape":[8],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":8,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":32,"data_ptr":"0x7e744ac02800"},"n_elements":{"type":"int","value":8},"scale_factor":{"type":"float","value":2.5},"ACTIVATION":{"type":"str","value":"none","length":4},"BLOCK_SIZE":{"type":"int","value":8}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":160,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"out2 = fused_op(x, y, z, scale_factor=2.5, activation=\"none\")"},{"line":116,"name":"fused_op","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"fused_op_kernel[grid](a, b, c, output, n_elements, scale_factor, ACTIVATION=activation, BLOCK_SIZE=BLOCK_SIZE)"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"compilation","pid":171439,"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":165,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"out3 = fused_op(x, y, z, scale_factor=1.0, activation=\"relu\")"},{"line":116,"name":"fused_op","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"fused_op_kernel[grid](a, b, c, output, n_elements, scale_factor, ACTIVATION=activation, BLOCK_SIZE=BLOCK_SIZE)"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":593,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel = self._do_compile(key, signature, device, constexprs, options, attrs, warmup)"},{"line":773,"name":"_do_compile","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel = self.compile(src, target=target, options=options.__dict__)"},{"line":267,"name":"compile","filename":"/scratch/findhao/pta/triton/python/triton/compiler/compiler.py","loc":"compilation_listener("},{"line":752,"name":"maybe_trace_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton("},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ","payload":{"metadata":{"cache_hit":true,"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"b04777ad4c88dc1395109fc4d1af29a6569897496e3e3571dffac94716e9f449","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"fused_op_kernel","num_ctas":1,"num_stages":3,"num_warps":4,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":0,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32,"env":{},"src_attrs":{"(0,)":[["tt.divisibility",16]],"(1,)":[["tt.divisibility",16]],"(2,)":[["tt.divisibility",16]],"(3,)":[["tt.divisibility",16]],"(4,)":[],"(6,)":[]},"src_cache_key":"3a0d6e78c4ea91afcbe3e257a4fc286afdba531229fcfd7e93314f8100fc0d32","src_constants":{"(6,)":"relu","(7,)":8}},"file_path":{"fused_op_kernel.source":"/home/findhao/.triton/cache/WBDXPLKMRDOBHFIQT7CNDLZJUZLJRF2JNY7DK4O77LEUOFXJ6REQ/fused_op_kernel.source","fused_op_kernel.ttir":"/home/findhao/.triton/cache/WBDXPLKMRDOBHFIQT7CNDLZJUZLJRF2JNY7DK4O77LEUOFXJ6REQ/fused_op_kernel.ttir","fused_op_kernel.ttgir":"/home/findhao/.triton/cache/WBDXPLKMRDOBHFIQT7CNDLZJUZLJRF2JNY7DK4O77LEUOFXJ6REQ/fused_op_kernel.ttgir","fused_op_kernel.llir":"/home/findhao/.triton/cache/WBDXPLKMRDOBHFIQT7CNDLZJUZLJRF2JNY7DK4O77LEUOFXJ6REQ/fused_op_kernel.llir","fused_op_kernel.ptx":"/home/findhao/.triton/cache/WBDXPLKMRDOBHFIQT7CNDLZJUZLJRF2JNY7DK4O77LEUOFXJ6REQ/fused_op_kernel.ptx","fused_op_kernel.cubin":"/home/findhao/.triton/cache/WBDXPLKMRDOBHFIQT7CNDLZJUZLJRF2JNY7DK4O77LEUOFXJ6REQ/fused_op_kernel.cubin","fused_op_kernel.json":"/home/findhao/.triton/cache/WBDXPLKMRDOBHFIQT7CNDLZJUZLJRF2JNY7DK4O77LEUOFXJ6REQ/fused_op_kernel.json"},"file_content":{"fused_op_kernel.ttir":"#loc = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0)\nmodule {\n tt.func public @fused_op_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg3: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg4: i32 loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg5: f32 loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0)) attributes {noinline = false} {\n %cst = arith.constant dense<0.000000e+00> : tensor<8xf32> loc(#loc1)\n %c8_i32 = arith.constant 8 : i32 loc(#loc1)\n %0 = tt.get_program_id x : i32 loc(#loc2)\n %1 = arith.muli %0, %c8_i32 : i32 loc(#loc3)\n %2 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc4)\n %3 = tt.splat %1 : i32 -> tensor<8xi32> loc(#loc5)\n %4 = arith.addi %3, %2 : tensor<8xi32> loc(#loc5)\n %5 = tt.splat %arg4 : i32 -> tensor<8xi32> loc(#loc6)\n %6 = arith.cmpi slt, %4, %5 : tensor<8xi32> loc(#loc6)\n %7 = tt.splat %arg0 : !tt.ptr -> tensor<8x!tt.ptr> loc(#loc7)\n %8 = tt.addptr %7, %4 : tensor<8x!tt.ptr>, tensor<8xi32> loc(#loc7)\n %9 = tt.load %8, %6 : tensor<8x!tt.ptr> loc(#loc8)\n %10 = tt.splat %arg1 : !tt.ptr -> tensor<8x!tt.ptr> loc(#loc9)\n %11 = tt.addptr %10, %4 : tensor<8x!tt.ptr>, tensor<8xi32> loc(#loc9)\n %12 = tt.load %11, %6 : tensor<8x!tt.ptr> loc(#loc10)\n %13 = tt.splat %arg2 : !tt.ptr -> tensor<8x!tt.ptr> loc(#loc11)\n %14 = tt.addptr %13, %4 : tensor<8x!tt.ptr>, tensor<8xi32> loc(#loc11)\n %15 = tt.load %14, %6 : tensor<8x!tt.ptr> loc(#loc12)\n %16 = arith.mulf %9, %12 : tensor<8xf32> loc(#loc13)\n %17 = tt.splat %arg5 : f32 -> tensor<8xf32> loc(#loc14)\n %18 = arith.mulf %16, %17 : tensor<8xf32> loc(#loc14)\n %19 = arith.addf %18, %15 : tensor<8xf32> loc(#loc15)\n %20 = arith.cmpf ogt, %19, %cst : tensor<8xf32> loc(#loc16)\n %21 = arith.select %20, %19, %cst : tensor<8xi1>, tensor<8xf32> loc(#loc17)\n %22 = tt.splat %arg3 : !tt.ptr -> tensor<8x!tt.ptr> loc(#loc18)\n %23 = tt.addptr %22, %4 : tensor<8x!tt.ptr>, tensor<8xi32> loc(#loc18)\n tt.store %23, %21, %6 : tensor<8x!tt.ptr> loc(#loc19)\n tt.return loc(#loc20)\n } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":97:24)\n#loc3 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":98:20)\n#loc4 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":98:46)\n#loc5 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":98:33)\n#loc6 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":99:21)\n#loc7 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":101:24)\n#loc8 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":101:16)\n#loc9 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":102:24)\n#loc10 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":102:16)\n#loc11 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":103:24)\n#loc12 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":103:16)\n#loc13 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":105:17)\n#loc14 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":105:21)\n#loc15 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":105:36)\n#loc16 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":107:35)\n#loc17 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":107:46)\n#loc18 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":109:26)\n#loc19 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":109:35)\n#loc20 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":109:4)\n","fused_op_kernel.ttgir":"#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>\n#loc = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0)\nmodule attributes {\"ttg.num-ctas\" = 1 : i32, \"ttg.num-warps\" = 4 : i32, ttg.target = \"cuda:75\", \"ttg.threads-per-warp\" = 32 : i32} {\n tt.func public @fused_op_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg1: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg2: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg3: !tt.ptr {tt.divisibility = 16 : i32} loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg4: i32 loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0), %arg5: f32 loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":90:0)) attributes {noinline = false} {\n %c8_i32 = arith.constant 8 : i32 loc(#loc1)\n %cst = arith.constant dense<0.000000e+00> : tensor<8xf32, #blocked> loc(#loc1)\n %0 = tt.get_program_id x : i32 loc(#loc2)\n %1 = arith.muli %0, %c8_i32 : i32 loc(#loc3)\n %2 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked> loc(#loc4)\n %3 = tt.splat %1 : i32 -> tensor<8xi32, #blocked> loc(#loc5)\n %4 = arith.addi %3, %2 : tensor<8xi32, #blocked> loc(#loc5)\n %5 = tt.splat %arg4 : i32 -> tensor<8xi32, #blocked> loc(#loc6)\n %6 = arith.cmpi slt, %4, %5 : tensor<8xi32, #blocked> loc(#loc6)\n %7 = tt.splat %arg0 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> loc(#loc7)\n %8 = tt.addptr %7, %4 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> loc(#loc7)\n %9 = tt.load %8, %6 : tensor<8x!tt.ptr, #blocked> loc(#loc8)\n %10 = tt.splat %arg1 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> loc(#loc9)\n %11 = tt.addptr %10, %4 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> loc(#loc9)\n %12 = tt.load %11, %6 : tensor<8x!tt.ptr, #blocked> loc(#loc10)\n %13 = tt.splat %arg2 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> loc(#loc11)\n %14 = tt.addptr %13, %4 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> loc(#loc11)\n %15 = tt.load %14, %6 : tensor<8x!tt.ptr, #blocked> loc(#loc12)\n %16 = arith.mulf %9, %12 : tensor<8xf32, #blocked> loc(#loc13)\n %17 = tt.splat %arg5 : f32 -> tensor<8xf32, #blocked> loc(#loc14)\n %18 = arith.mulf %16, %17 : tensor<8xf32, #blocked> loc(#loc14)\n %19 = arith.addf %18, %15 : tensor<8xf32, #blocked> loc(#loc15)\n %20 = arith.cmpf ogt, %19, %cst : tensor<8xf32, #blocked> loc(#loc16)\n %21 = arith.select %20, %19, %cst : tensor<8xi1, #blocked>, tensor<8xf32, #blocked> loc(#loc17)\n %22 = tt.splat %arg3 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> loc(#loc18)\n %23 = tt.addptr %22, %4 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> loc(#loc18)\n tt.store %23, %21, %6 : tensor<8x!tt.ptr, #blocked> loc(#loc19)\n tt.return loc(#loc20)\n } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":97:24)\n#loc3 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":98:20)\n#loc4 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":98:46)\n#loc5 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":98:33)\n#loc6 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":99:21)\n#loc7 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":101:24)\n#loc8 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":101:16)\n#loc9 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":102:24)\n#loc10 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":102:16)\n#loc11 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":103:24)\n#loc12 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":103:16)\n#loc13 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":105:17)\n#loc14 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":105:21)\n#loc15 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":105:36)\n#loc16 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":107:35)\n#loc17 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":107:46)\n#loc18 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":109:26)\n#loc19 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":109:35)\n#loc20 = loc(\"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\":109:4)\n","fused_op_kernel.llir":"; ModuleID = 'LLVMDialectModule'\nsource_filename = \"LLVMDialectModule\"\ntarget datalayout = \"e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64\"\n\ndefine ptx_kernel void @fused_op_kernel(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, float %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !5 {\n %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8\n %9 = shl i32 %8, 3, !dbg !9\n %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10\n %11 = and i32 %10, 7, !dbg !10\n %12 = or disjoint i32 %9, %11, !dbg !11\n %13 = icmp slt i32 %12, %4, !dbg !12\n %14 = sext i32 %12 to i64, !dbg !13\n %15 = getelementptr float, ptr addrspace(1) %0, i64 %14, !dbg !13\n %16 = tail call i32 asm sideeffect \"mov.u32 $0, 0x0;\\0A\\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];\", \"=r,l,b\"(ptr addrspace(1) %15, i1 %13) #2, !dbg !14\n %17 = bitcast i32 %16 to float, !dbg !14\n %18 = getelementptr float, ptr addrspace(1) %1, i64 %14, !dbg !15\n %19 = tail call i32 asm sideeffect \"mov.u32 $0, 0x0;\\0A\\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];\", \"=r,l,b\"(ptr addrspace(1) %18, i1 %13) #2, !dbg !16\n %20 = bitcast i32 %19 to float, !dbg !16\n %21 = getelementptr float, ptr addrspace(1) %2, i64 %14, !dbg !17\n %22 = tail call i32 asm sideeffect \"mov.u32 $0, 0x0;\\0A\\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];\", \"=r,l,b\"(ptr addrspace(1) %21, i1 %13) #2, !dbg !18\n %23 = bitcast i32 %22 to float, !dbg !18\n %24 = fmul float %17, %20, !dbg !19\n %25 = fmul float %5, %24, !dbg !20\n %26 = fadd float %25, %23, !dbg !21\n %27 = fcmp ogt float %26, 0.000000e+00, !dbg !22\n %28 = select i1 %27, float %26, float 0.000000e+00, !dbg !23\n %29 = getelementptr float, ptr addrspace(1) %3, i64 %14, !dbg !24\n %30 = and i32 %10, 120, !dbg !25\n %31 = icmp eq i32 %30, 0, !dbg !25\n %32 = bitcast float %28 to i32, !dbg !25\n %33 = and i1 %31, %13, !dbg !25\n tail call void asm sideeffect \"@$2 st.global.b32 [ $1 + 0 ], { $0 };\", \"r,l,b\"(i32 %32, ptr addrspace(1) %29, i1 %33) #2, !dbg !25\n ret void, !dbg !26\n}\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1\n\n; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)\ndeclare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1\n\nattributes #0 = { \"nvvm.reqntid\"=\"128\" }\nattributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }\nattributes #2 = { nounwind }\n\n!llvm.dbg.cu = !{!0}\n!llvm.module.flags = !{!2, !3}\n!llvm.ident = !{!4}\n\n!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: \"triton\", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)\n!1 = !DIFile(filename: \"test_complex_kernels.py\", directory: \"/scratch/findhao/tritonparse/tests\")\n!2 = !{i32 2, !\"Debug Info Version\", i32 3}\n!3 = !{i32 4, !\"nvvm-reflect-ftz\", i32 1}\n!4 = !{!\"clang version 3.8.0 (tags/RELEASE_380/final)\"}\n!5 = distinct !DISubprogram(name: \"fused_op_kernel\", linkageName: \"fused_op_kernel\", scope: !1, file: !1, line: 90, type: !6, scopeLine: 90, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)\n!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)\n!7 = !{}\n!8 = !DILocation(line: 97, column: 24, scope: !5)\n!9 = !DILocation(line: 98, column: 20, scope: !5)\n!10 = !DILocation(line: 98, column: 46, scope: !5)\n!11 = !DILocation(line: 98, column: 33, scope: !5)\n!12 = !DILocation(line: 99, column: 21, scope: !5)\n!13 = !DILocation(line: 101, column: 24, scope: !5)\n!14 = !DILocation(line: 101, column: 16, scope: !5)\n!15 = !DILocation(line: 102, column: 24, scope: !5)\n!16 = !DILocation(line: 102, column: 16, scope: !5)\n!17 = !DILocation(line: 103, column: 24, scope: !5)\n!18 = !DILocation(line: 103, column: 16, scope: !5)\n!19 = !DILocation(line: 105, column: 17, scope: !5)\n!20 = !DILocation(line: 105, column: 21, scope: !5)\n!21 = !DILocation(line: 105, column: 36, scope: !5)\n!22 = !DILocation(line: 107, column: 35, scope: !5)\n!23 = !DILocation(line: 107, column: 46, scope: !5)\n!24 = !DILocation(line: 109, column: 26, scope: !5)\n!25 = !DILocation(line: 109, column: 35, scope: !5)\n!26 = !DILocation(line: 109, column: 4, scope: !5)\n","fused_op_kernel.ptx":"//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 8.7\n.target sm_75\n.address_size 64\n\n\t// .globl\tfused_op_kernel // -- Begin function fused_op_kernel\n // @fused_op_kernel\n.visible .entry fused_op_kernel(\n\t.param .u64 .ptr .global .align 1 fused_op_kernel_param_0,\n\t.param .u64 .ptr .global .align 1 fused_op_kernel_param_1,\n\t.param .u64 .ptr .global .align 1 fused_op_kernel_param_2,\n\t.param .u64 .ptr .global .align 1 fused_op_kernel_param_3,\n\t.param .u32 fused_op_kernel_param_4,\n\t.param .f32 fused_op_kernel_param_5,\n\t.param .u64 .ptr .global .align 1 fused_op_kernel_param_6\n)\n.reqntid 128\n{\n\t.reg .pred \t%p<6>;\n\t.reg .b32 \t%r<15>;\n\t.reg .b64 \t%rd<10>;\n\t.loc\t1 90 0 // test_complex_kernels.py:90:0\n$L__func_begin0:\n\t.loc\t1 90 0 // test_complex_kernels.py:90:0\n\n// %bb.0:\n\tld.param.b64 \t%rd5, [fused_op_kernel_param_0];\n\tld.param.b64 \t%rd6, [fused_op_kernel_param_1];\n$L__tmp0:\n\t.loc\t1 97 24 // test_complex_kernels.py:97:24\n\tmov.u32 \t%r5, %ctaid.x;\n\t.loc\t1 98 20 // test_complex_kernels.py:98:20\n\tshl.b32 \t%r6, %r5, 3;\n\tld.param.b64 \t%rd7, [fused_op_kernel_param_2];\n\tld.param.b64 \t%rd8, [fused_op_kernel_param_3];\n\t.loc\t1 98 46 // test_complex_kernels.py:98:46\n\tmov.u32 \t%r7, %tid.x;\n\tand.b32 \t%r8, %r7, 7;\n\tld.param.b32 \t%r9, [fused_op_kernel_param_4];\n\t.loc\t1 98 33 // test_complex_kernels.py:98:33\n\tor.b32 \t%r10, %r6, %r8;\n\tld.param.b32 \t%r11, [fused_op_kernel_param_5];\n\t.loc\t1 99 21 // test_complex_kernels.py:99:21\n\tsetp.lt.s32 \t%p1, %r10, %r9;\n\t.loc\t1 101 24 // test_complex_kernels.py:101:24\n\tmul.wide.s32 \t%rd9, %r10, 4;\n\tadd.s64 \t%rd1, %rd5, %rd9;\n\t.loc\t1 101 16 // test_complex_kernels.py:101:16\n\t// begin inline asm\n\tmov.u32 %r1, 0x0;\n\t@%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];\n\t// end inline asm\n\t.loc\t1 102 24 // test_complex_kernels.py:102:24\n\tadd.s64 \t%rd2, %rd6, %rd9;\n\t.loc\t1 102 16 // test_complex_kernels.py:102:16\n\t// begin inline asm\n\tmov.u32 %r2, 0x0;\n\t@%p1 ld.global.b32 { %r2 }, [ %rd2 + 0 ];\n\t// end inline asm\n\t.loc\t1 103 24 // test_complex_kernels.py:103:24\n\tadd.s64 \t%rd3, %rd7, %rd9;\n\t.loc\t1 103 16 // test_complex_kernels.py:103:16\n\t// begin inline asm\n\tmov.u32 %r3, 0x0;\n\t@%p1 ld.global.b32 { %r3 }, [ %rd3 + 0 ];\n\t// end inline asm\n\t.loc\t1 105 17 // test_complex_kernels.py:105:17\n\tmul.f32 \t%r12, %r1, %r2;\n\t.loc\t1 105 36 // test_complex_kernels.py:105:36\n\tfma.rn.f32 \t%r13, %r11, %r12, %r3;\n\t.loc\t1 107 46 // test_complex_kernels.py:107:46\n\tmax.f32 \t%r4, %r13, 0f00000000;\n\t.loc\t1 109 26 // test_complex_kernels.py:109:26\n\tadd.s64 \t%rd4, %rd8, %rd9;\n\t.loc\t1 109 35 // test_complex_kernels.py:109:35\n\tand.b32 \t%r14, %r7, 120;\n\tsetp.eq.s32 \t%p5, %r14, 0;\n\tand.pred \t%p4, %p5, %p1;\n\t// begin inline asm\n\t@%p4 st.global.b32 [ %rd4 + 0 ], { %r4 };\n\t// end inline asm\n\t.loc\t1 109 4 // test_complex_kernels.py:109:4\n\tret;\n$L__tmp1:\n$L__func_end0:\n // -- End function\n}\n\t.file\t1 \"/scratch/findhao/tritonparse/tests/test_complex_kernels.py\"\n\t.section\t.debug_abbrev\n\t{\n.b8 1 // Abbreviation Code\n.b8 17 // DW_TAG_compile_unit\n.b8 0 // DW_CHILDREN_no\n.b8 37 // DW_AT_producer\n.b8 8 // DW_FORM_string\n.b8 19 // DW_AT_language\n.b8 5 // DW_FORM_data2\n.b8 3 // DW_AT_name\n.b8 8 // DW_FORM_string\n.b8 16 // DW_AT_stmt_list\n.b8 6 // DW_FORM_data4\n.b8 27 // DW_AT_comp_dir\n.b8 8 // DW_FORM_string\n.b8 0 // EOM(1)\n.b8 0 // EOM(2)\n.b8 0 // EOM(3)\n\t}\n\t.section\t.debug_info\n\t{\n.b32 80 // Length of Unit\n.b8 2 // DWARF version number\n.b8 0\n.b32 .debug_abbrev // Offset Into Abbrev. Section\n.b8 8 // Address Size (in bytes)\n.b8 1 // Abbrev [1] 0xb:0x49 DW_TAG_compile_unit\n.b8 116 // DW_AT_producer\n.b8 114\n.b8 105\n.b8 116\n.b8 111\n.b8 110\n.b8 0\n.b8 2 // DW_AT_language\n.b8 0\n.b8 116 // DW_AT_name\n.b8 101\n.b8 115\n.b8 116\n.b8 95\n.b8 99\n.b8 111\n.b8 109\n.b8 112\n.b8 108\n.b8 101\n.b8 120\n.b8 95\n.b8 107\n.b8 101\n.b8 114\n.b8 110\n.b8 101\n.b8 108\n.b8 115\n.b8 46\n.b8 112\n.b8 121\n.b8 0\n.b32 .debug_line // DW_AT_stmt_list\n.b8 47 // DW_AT_comp_dir\n.b8 115\n.b8 99\n.b8 114\n.b8 97\n.b8 116\n.b8 99\n.b8 104\n.b8 47\n.b8 102\n.b8 105\n.b8 110\n.b8 100\n.b8 104\n.b8 97\n.b8 111\n.b8 47\n.b8 116\n.b8 114\n.b8 105\n.b8 116\n.b8 111\n.b8 110\n.b8 112\n.b8 97\n.b8 114\n.b8 115\n.b8 101\n.b8 47\n.b8 116\n.b8 101\n.b8 115\n.b8 116\n.b8 115\n.b8 0\n\t}\n\t.section\t.debug_macinfo\t{\t}\n","fused_op_kernel.json":"{\"hash\": \"b04777ad4c88dc1395109fc4d1af29a6569897496e3e3571dffac94716e9f449\", \"target\": {\"backend\": \"cuda\", \"arch\": 75, \"warp_size\": 32}, \"num_warps\": 4, \"num_ctas\": 1, \"num_stages\": 3, \"warp_size\": 32, \"maxnreg\": null, \"cluster_dims\": [1, 1, 1], \"ptx_version\": null, \"ptx_options\": null, \"ir_override\": null, \"enable_fp_fusion\": true, \"launch_cooperative_grid\": false, \"launch_pdl\": false, \"supported_fp8_dtypes\": [\"fp8e4b15\", \"fp8e5\"], \"deprecated_fp8_dot_operand_dtypes\": [], \"default_dot_input_precision\": \"tf32\", \"allowed_dot_input_precisions\": [\"tf32\", \"tf32x3\", \"ieee\"], \"max_num_imprecise_acc_default\": 0, \"extern_libs\": [[\"libdevice\", \"/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc\"]], \"debug\": false, \"backend_name\": \"cuda\", \"sanitize_overflow\": true, \"arch\": \"sm75\", \"triton_version\": \"3.4.0\", \"tensordesc_meta\": [], \"shared\": 0, \"tmem_size\": 0, \"global_scratch_size\": 0, \"global_scratch_align\": 1, \"name\": \"fused_op_kernel\"}"},"python_source":{"file_path":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","start_line":89,"end_line":110,"code":"@triton.jit\ndef fused_op_kernel(\n a_ptr, b_ptr, c_ptr, output_ptr,\n n_elements,\n scale_factor: float,\n ACTIVATION: tl.constexpr,\n BLOCK_SIZE: tl.constexpr,\n):\n pid = tl.program_id(axis=0)\n offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)\n mask = offsets < n_elements\n\n a = tl.load(a_ptr + offsets, mask=mask)\n b = tl.load(b_ptr + offsets, mask=mask)\n c = tl.load(c_ptr + offsets, mask=mask)\n\n result = a * b * scale_factor + c\n if ACTIVATION == \"relu\":\n result = tl.where(result > 0, result, 0.0)\n \n tl.store(output_ptr + offsets, result, mask=mask)\n"},"times":{"ir_initialization":713,"lowering_stages":[],"store_results":0}}} +{"event_type":"launch","pid":171439,"name":"fused_op_kernel","function":null,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"b04777ad4c88dc1395109fc4d1af29a6569897496e3e3571dffac94716e9f449","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"fused_op_kernel","num_ctas":1,"num_stages":3,"num_warps":4,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":0,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[8],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":8,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":32,"data_ptr":"0x7e744ac02000"},"b_ptr":{"type":"tensor","shape":[8],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":8,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":32,"data_ptr":"0x7e744ac02200"},"c_ptr":{"type":"tensor","shape":[8],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":8,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":32,"data_ptr":"0x7e744ac02400"},"output_ptr":{"type":"tensor","shape":[8],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":8,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":32,"data_ptr":"0x7e744ac02a00"},"n_elements":{"type":"int","value":8},"scale_factor":{"type":"float","value":1.0},"ACTIVATION":{"type":"str","value":"relu","length":4},"BLOCK_SIZE":{"type":"int","value":8}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":165,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"out3 = fused_op(x, y, z, scale_factor=1.0, activation=\"relu\")"},{"line":116,"name":"fused_op","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"fused_op_kernel[grid](a, b, c, output, n_elements, scale_factor, ACTIVATION=activation, BLOCK_SIZE=BLOCK_SIZE)"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} +{"event_type":"launch","pid":171439,"name":"fused_op_kernel","function":98431773370048,"stream":0,"grid":[1],"compilation_metadata":{"allowed_dot_input_precisions":["tf32","tf32x3","ieee"],"arch":"sm75","backend_name":"cuda","cluster_dims":[1,1,1],"debug":false,"default_dot_input_precision":"tf32","deprecated_fp8_dot_operand_dtypes":[],"enable_fp_fusion":true,"extern_libs":[["libdevice","/scratch/findhao/pta/triton/python/triton/backends/nvidia/lib/libdevice.10.bc"]],"global_scratch_align":1,"global_scratch_size":0,"hash":"b04777ad4c88dc1395109fc4d1af29a6569897496e3e3571dffac94716e9f449","ir_override":null,"launch_cooperative_grid":false,"launch_pdl":false,"max_num_imprecise_acc_default":0,"maxnreg":null,"name":"fused_op_kernel","num_ctas":1,"num_stages":3,"num_warps":4,"ptx_options":null,"ptx_version":null,"sanitize_overflow":true,"shared":0,"supported_fp8_dtypes":["fp8e4b15","fp8e5"],"target":{"backend":"cuda","arch":75,"warp_size":32},"tensordesc_meta":[],"tmem_size":0,"triton_version":"3.4.0","warp_size":32},"extracted_args":{"a_ptr":{"type":"tensor","shape":[6],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":6,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":24,"data_ptr":"0x7e744ac02c00"},"b_ptr":{"type":"tensor","shape":[6],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":6,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":24,"data_ptr":"0x7e744ac02e00"},"c_ptr":{"type":"tensor","shape":[6],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":6,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":24,"data_ptr":"0x7e744ac03000"},"output_ptr":{"type":"tensor","shape":[6],"dtype":"torch.float32","device":"cuda:0","stride":[1],"numel":6,"is_contiguous":true,"element_size":4,"storage_offset":0,"memory_usage":24,"data_ptr":"0x7e744ac03200"},"n_elements":{"type":"int","value":6},"scale_factor":{"type":"float","value":1.0},"ACTIVATION":{"type":"str","value":"relu","length":4},"BLOCK_SIZE":{"type":"int","value":8}},"stack":[{"line":179,"name":"","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"test_complex_kernels()"},{"line":173,"name":"test_complex_kernels","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"out4 = fused_op(x_large, y_large, z_large, scale_factor=1.0, activation=\"relu\")"},{"line":116,"name":"fused_op","filename":"/scratch/findhao/tritonparse/tests/test_complex_kernels.py","loc":"fused_op_kernel[grid](a, b, c, output, n_elements, scale_factor, ACTIVATION=activation, BLOCK_SIZE=BLOCK_SIZE)"},{"line":393,"name":"","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)"},{"line":617,"name":"run","filename":"/scratch/findhao/pta/triton/python/triton/runtime/jit.py","loc":"kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,"},{"line":652,"name":"__call__","filename":"/scratch/findhao/pta/triton/python/triton/backends/nvidia/driver.py","loc":"self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,"},{"line":938,"name":"__call__","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"trace_structured_triton(\"launch\", metadata_fn=lambda: convert(trace_data))"},{"line":666,"name":"trace_structured_triton","filename":"/scratch/findhao/tritonparse/tritonparse/structured_logging.py","loc":"metadata_dict[\"stack\"] = get_stack_trace()"}],"timestamp":"2025-07-20T01:15:07.%fZ"} diff --git a/tests/example_output/parsed_output_complex/dedicated_log_triton_trace_findhao__mapped.ndjson.gz b/tests/example_output/parsed_output_complex/dedicated_log_triton_trace_findhao__mapped.ndjson.gz new file mode 100644 index 0000000..7889849 Binary files /dev/null and b/tests/example_output/parsed_output_complex/dedicated_log_triton_trace_findhao__mapped.ndjson.gz differ diff --git a/tests/example_output/parsed_output_complex/log_file_list.json b/tests/example_output/parsed_output_complex/log_file_list.json new file mode 100644 index 0000000..d9f057a --- /dev/null +++ b/tests/example_output/parsed_output_complex/log_file_list.json @@ -0,0 +1,8 @@ +{ + "tritonparse_url_prefix": "", + "rank_default": { + "regular_files": [], + "mapped_file": "dedicated_log_triton_trace_findhao__mapped.ndjson.gz", + "rank_suffix": "" + } +} \ No newline at end of file diff --git a/tests/test_complex_kernels.py b/tests/test_complex_kernels.py new file mode 100644 index 0000000..389d8a4 --- /dev/null +++ b/tests/test_complex_kernels.py @@ -0,0 +1,256 @@ +""" +A more complex test case involving two distinct Triton kernels, one of which uses autotuning. +This test is designed to validate the launch_diff functionality with multiple, varied launches. + +Test Plan: +``` +TORCHINDUCTOR_FX_GRAPH_CACHE=0 TRITONPARSE_DEBUG=1 python tests/test_complex_kernels.py +``` +""" + +import os + +import torch +import triton +import triton.language as tl + +import tritonparse.structured_logging +import tritonparse.utils + +# Initialize logging +log_path = "./logs_complex" +tritonparse.structured_logging.init(log_path, enable_trace_launch=True) + +os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "0" + + +# Kernel 1: Autotuned Matmul (simplified configs for small scale) +@triton.autotune( + configs=[ + triton.Config( + { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 16, + "GROUP_SIZE_M": 1, + }, + num_stages=1, + num_warps=1, + ), + triton.Config( + { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 16, + "GROUP_SIZE_M": 1, + }, + num_stages=1, + num_warps=1, + ), + triton.Config( + { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 16, + "GROUP_SIZE_M": 1, + }, + num_stages=1, + num_warps=1, + ), + ], + key=["M", "N", "K"], +) +@triton.jit +def matmul_kernel( + a, + b, + c, + M, + N, + K, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + ACTIVATION: tl.constexpr, +): + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size) + pid_n = (pid % num_pid_in_group) // group_size + + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + b_ptrs = b + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a_block = tl.load( + a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0 + ) + b_block = tl.load( + b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0 + ) + accumulator += tl.dot(a_block, b_block) + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + c_block = accumulator.to(tl.float16) + + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c_block, mask=c_mask) + + +def matmul(a, b): + assert a.shape[1] == b.shape[0], "Incompatible dimensions" + M, K = a.shape + K, N = b.shape + c = torch.empty((M, N), device=a.device, dtype=a.dtype) + + def grid(META): + return ( + triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), + ) + + matmul_kernel[grid]( + a, + b, + c, + M, + N, + K, + a.stride(0), + a.stride(1), + b.stride(0), + b.stride(1), + c.stride(0), + c.stride(1), + ACTIVATION=None, + ) + return c + + +# Kernel 2: Fused element-wise operation +@triton.jit +def fused_op_kernel( + a_ptr, + b_ptr, + c_ptr, + output_ptr, + n_elements, + scale_factor: float, + ACTIVATION: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(axis=0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + + a = tl.load(a_ptr + offsets, mask=mask) + b = tl.load(b_ptr + offsets, mask=mask) + c = tl.load(c_ptr + offsets, mask=mask) + + result = a * b * scale_factor + c + if ACTIVATION == "relu": + result = tl.where(result > 0, result, 0.0) + + tl.store(output_ptr + offsets, result, mask=mask) + + +def fused_op(a, b, c, scale_factor: float, activation: str): + n_elements = a.numel() + output = torch.empty_like(a) + BLOCK_SIZE = 8 # Reduced from 1024 for small scale testing + grid = (triton.cdiv(n_elements, BLOCK_SIZE),) + fused_op_kernel[grid]( + a, + b, + c, + output, + n_elements, + scale_factor, + ACTIVATION=activation, + BLOCK_SIZE=BLOCK_SIZE, + ) + return output + + +def test_complex_kernels(): + """Main test function to run both kernels with varied parameters.""" + torch.manual_seed(0) + + # --- Matmul Launches (3 times with different configs) --- + print("--- Testing Matmul Kernel (3 launches) ---") + # Launch 1 + a1 = torch.randn((16, 16), device="cuda", dtype=torch.float16) + b1 = torch.randn((16, 16), device="cuda", dtype=torch.float16) + c1 = matmul(a1, b1) + c1.sum() # Synchronize + print("Matmul Launch 1 (16x16 @ 16x16) done.") + + # Launch 2 + a2 = torch.randn((32, 16), device="cuda", dtype=torch.float16) + b2 = torch.randn((16, 32), device="cuda", dtype=torch.float16) + c2 = matmul(a2, b2) + c2.sum() # Synchronize + print("Matmul Launch 2 (32x16 @ 16x32) done.") + + # Launch 3 + a3 = torch.randn((16, 32), device="cuda", dtype=torch.float16) + b3 = torch.randn((32, 16), device="cuda", dtype=torch.float16) + c3 = matmul(a3, b3) + c3.sum() # Synchronize + print("Matmul Launch 3 (16x32 @ 32x16) done.") + + # --- Fused Op Launches (4 times with different parameters) --- + print("\n--- Testing Fused Op Kernel (4 launches) ---") + x = torch.randn((8,), device="cuda", dtype=torch.float32) + y = torch.randn((8,), device="cuda", dtype=torch.float32) + z = torch.randn((8,), device="cuda", dtype=torch.float32) + + # Launch 1 + print("Fused Op Launch 1: scale=1.0, activation=None") + out1 = fused_op(x, y, z, scale_factor=1.0, activation="none") + out1.sum() # Synchronize + + # Launch 2 + print("Fused Op Launch 2: scale=2.5, activation=None") + out2 = fused_op(x, y, z, scale_factor=2.5, activation="none") + out2.sum() # Synchronize + + # Launch 3 + print("Fused Op Launch 3: scale=1.0, activation='relu'") + out3 = fused_op(x, y, z, scale_factor=1.0, activation="relu") + out3.sum() # Synchronize + + # Launch 4 (different size) + print("Fused Op Launch 4: scale=1.0, activation='relu', different size") + x_large = torch.randn((6,), device="cuda", dtype=torch.float32) + y_large = torch.randn((6,), device="cuda", dtype=torch.float32) + z_large = torch.randn((6,), device="cuda", dtype=torch.float32) + out4 = fused_op(x_large, y_large, z_large, scale_factor=1.0, activation="relu") + out4.sum() # Synchronize + print("All kernels executed.") + + +if __name__ == "__main__": + test_complex_kernels() + # Use unified_parse to process the generated logs + tritonparse.utils.unified_parse( + source=log_path, out="./parsed_output_complex", overwrite=True + ) diff --git a/tritonparse/extract_source_mappings.py b/tritonparse/extract_source_mappings.py index 194f4c9..a3fd972 100644 --- a/tritonparse/extract_source_mappings.py +++ b/tritonparse/extract_source_mappings.py @@ -449,80 +449,80 @@ def parse_single_trace_content(trace_content: str) -> str: """ entry = json.loads(trace_content) - if entry.get("event_type") != "compilation" and "payload" in entry: - logger.warning("Not a compilation event. Skipping.") - return "" - payload = entry.setdefault("payload", {}) - file_content = payload.get("file_content", {}) - file_path = payload.get("file_path", {}) - - # Find the IR file keys - ttir_key = next((k for k in file_content if k.endswith(".ttir")), None) - ttgir_key = next((k for k in file_content if k.endswith(".ttgir")), None) - ptx_key = next((k for k in file_content if k.endswith(".ptx")), None) - amdgcn_key = next((k for k in file_content if k.endswith(".amdgcn")), None) - # Skip if no IR files found - if not (ttir_key or ttgir_key or ptx_key or amdgcn_key): - logger.warning("No IR files found in the payload.") - return trace_content - - # generate ttir->source, ttgir->source, ptx->source - ttir_map = process_ir(ttir_key, file_content, file_path) - ttgir_map = process_ir(ttgir_key, file_content, file_path) - ptx_map = process_ir(ptx_key, file_content, file_path, [ttir_map, ttgir_map]) - amdgcn_map = process_ir(amdgcn_key, file_content, file_path, [ttir_map, ttgir_map]) - - # Create bidirectional mappings between all IR types - ir_maps = { - "ttir": ttir_map, - "ttgir": ttgir_map, - "ptx": ptx_map, - "amdgcn": amdgcn_map, - } - - # Create mappings between all pairs of IR types - ir_types = list(ir_maps.keys()) - for i, src_type in enumerate(ir_types): - for tgt_type in ir_types[i + 1 :]: - if ir_maps[src_type] and ir_maps[tgt_type]: - create_bidirectional_mapping( - ir_maps[src_type], ir_maps[tgt_type], src_type, tgt_type - ) - logger.debug( - f"Created bidirectional mapping between {src_type} and {tgt_type}" - ) - - py_map = {} - - if "python_source" in payload: - logger.debug( - f"Added Python source information (lines {payload['python_source']['start_line']}-{payload['python_source']['end_line']})" + if entry.get("event_type") == "compilation": + payload = entry.setdefault("payload", {}) + file_content = payload.get("file_content", {}) + file_path = payload.get("file_path", {}) + + # Find the IR file keys + ttir_key = next((k for k in file_content if k.endswith(".ttir")), None) + ttgir_key = next((k for k in file_content if k.endswith(".ttgir")), None) + ptx_key = next((k for k in file_content if k.endswith(".ptx")), None) + amdgcn_key = next((k for k in file_content if k.endswith(".amdgcn")), None) + # Skip if no IR files found + if not (ttir_key or ttgir_key or ptx_key or amdgcn_key): + logger.warning("No IR files found in the payload.") + return trace_content + + # generate ttir->source, ttgir->source, ptx->source + ttir_map = process_ir(ttir_key, file_content, file_path) + ttgir_map = process_ir(ttgir_key, file_content, file_path) + ptx_map = process_ir(ptx_key, file_content, file_path, [ttir_map, ttgir_map]) + amdgcn_map = process_ir( + amdgcn_key, file_content, file_path, [ttir_map, ttgir_map] ) - # 4. Create Python source to IR mappings. We use the original line numbers as key in the python source code. - # Create a list of valid IR mappings, filtering out None keys - ir_mappings = [] - ir_keys_and_maps = [ - (ttir_key, ttir_map), - (ttgir_key, ttgir_map), - (ptx_key, ptx_map), - (amdgcn_key, amdgcn_map), - ] - - for key, mapping in ir_keys_and_maps: - if key: - ir_mappings.append((get_file_extension(key), mapping)) - - py_map = create_python_mapping(ir_mappings) - - # Store the mappings in the payload - payload["source_mappings"] = { - "ttir": ttir_map, - "ttgir": ttgir_map, - **({"ptx": ptx_map} if ptx_map else {}), - **({"amdgcn": amdgcn_map} if amdgcn_map else {}), - "python": py_map, - } + # Create bidirectional mappings between all IR types + ir_maps = { + "ttir": ttir_map, + "ttgir": ttgir_map, + "ptx": ptx_map, + "amdgcn": amdgcn_map, + } + + # Create mappings between all pairs of IR types + ir_types = list(ir_maps.keys()) + for i, src_type in enumerate(ir_types): + for tgt_type in ir_types[i + 1 :]: + if ir_maps[src_type] and ir_maps[tgt_type]: + create_bidirectional_mapping( + ir_maps[src_type], ir_maps[tgt_type], src_type, tgt_type + ) + logger.debug( + f"Created bidirectional mapping between {src_type} and {tgt_type}" + ) + + py_map = {} + + if "python_source" in payload: + logger.debug( + f"Added Python source information (lines {payload['python_source']['start_line']}-{payload['python_source']['end_line']})" + ) + + # 4. Create Python source to IR mappings. We use the original line numbers as key in the python source code. + # Create a list of valid IR mappings, filtering out None keys + ir_mappings = [] + ir_keys_and_maps = [ + (ttir_key, ttir_map), + (ttgir_key, ttgir_map), + (ptx_key, ptx_map), + (amdgcn_key, amdgcn_map), + ] + + for key, mapping in ir_keys_and_maps: + if key: + ir_mappings.append((get_file_extension(key), mapping)) + + py_map = create_python_mapping(ir_mappings) + + # Store the mappings in the payload + payload["source_mappings"] = { + "ttir": ttir_map, + "ttgir": ttgir_map, + **({"ptx": ptx_map} if ptx_map else {}), + **({"amdgcn": amdgcn_map} if amdgcn_map else {}), + "python": py_map, + } # NDJSON format requires a newline at the end of each line return json.dumps(entry, separators=(",", ":")) + "\n" @@ -533,90 +533,134 @@ def parse_single_file( split_by_frame_id_and_compile_id: bool = True, ): """ - Process a single file and extract source code mappings. + Process a single file, correctly group events by kernel, and extract mappings. - This function takes a file path as input, reads the file content, and extracts - source code mappings from Triton trace JSON files. It processes each line of the file, - parses the trace content to extract IR mappings, and writes the updated content - to output files. + This function reads a trace file, groups compilation and launch events by + their kernel hash, generates a launch_diff event for each kernel, and writes + the processed data to output files. Args: file_path (str): The path to the file to be processed. - output_dir (str, optional): Directory to save the output files with mappings. - split_by_frame_id_and_compile_id (bool, optional): Whether to split output files - by frame_id and compile_id. Defaults to True. - - Returns: - None. The function writes the processed data to files in the output_dir. - Each output file will contain the original trace data enriched with source mappings - in NDJSON format (one JSON object per line). + output_dir (str, optional): Directory to save the output files. + split_by_frame_id_and_compile_id (bool, optional): Whether to split + output files by frame_id and compile_id. Defaults to True. """ - outputs = defaultdict(list) + kernels_by_hash = defaultdict( + lambda: {"compilation": None, "launches": [], "output_file": None} + ) - # Set default output directory if not provided output_dir = output_dir or os.path.dirname(file_path) - - # Check if input file is compressed based on file extension is_compressed_input = file_path.endswith(".bin.ndjson") - - # Open file in appropriate mode - use gzip.open for compressed files - if is_compressed_input: - # Use gzip.open which automatically handles member concatenation - file_handle = gzip.open(file_path, "rt", encoding="utf-8") - else: - file_handle = open(file_path, "r") + file_handle = ( + gzip.open(file_path, "rt", encoding="utf-8") + if is_compressed_input + else open(file_path, "r") + ) with file_handle as f: file_name = os.path.basename(file_path) - # Handle .bin.ndjson extension properly - if is_compressed_input: - file_name_without_extension = file_name[:-11] # Remove .bin.ndjson - else: - file_name_without_extension = os.path.splitext(file_name)[0] + file_name_without_extension = ( + file_name[:-11] if is_compressed_input else os.path.splitext(file_name)[0] + ) - # Process lines uniformly for both compressed and uncompressed files for i, line in enumerate(f): logger.debug(f"Processing line {i + 1} in {file_path}") - json_str = line.strip() if not json_str: continue - parsed_line = parse_single_trace_content(json_str) - if not parsed_line: - logger.warning(f"Failed to parse line {i + 1} in {file_path}") + # We don't need to generate full mappings for every line here, + # just enough to get the event type and necessary IDs. + try: + parsed_json = json.loads(json_str) + except json.JSONDecodeError: + logger.warning(f"Failed to parse JSON on line {i + 1} in {file_path}") continue - parsed_json = json.loads(parsed_line) - payload = parsed_json.get("payload", None) - if split_by_frame_id_and_compile_id: - if not payload: - logger.warning("No payload found in the parsed JSON.") + event_type = parsed_json.get("event_type", None) + payload = parsed_json.get("payload", {}) + + if event_type == "compilation": + kernel_hash = payload.get("metadata", {}).get("hash") + if not kernel_hash: continue - pt_info = payload.get("pt_info", {}) - frame_id = pt_info.get("frame_id", None) - frame_compile_id = pt_info.get("frame_compile_id", None) - compiled_autograd_id = pt_info.get("compiled_autograd_id", "-") - attempt_id = pt_info.get("attempt_id", 0) - output_file_name = "" - if frame_id is not None or frame_compile_id is not None: - output_file_name = f"f{frame_id}_fc{frame_compile_id}_a{attempt_id}_cai{compiled_autograd_id}.ndjson" + + if split_by_frame_id_and_compile_id: + pt_info = payload.get("pt_info", {}) + frame_id = pt_info.get("frame_id") + frame_compile_id = pt_info.get("frame_compile_id") + attempt_id = pt_info.get("attempt_id", 0) + cai = pt_info.get("compiled_autograd_id", "-") + if frame_id is not None or frame_compile_id is not None: + fname = f"f{frame_id}_fc{frame_compile_id}_a{attempt_id}_cai{cai}.ndjson" + else: + fname = f"{file_name_without_extension}_mapped.ndjson" else: - logger.debug( - "No frame_id or frame_compile_id found in the payload." + fname = f"{file_name_without_extension}_mapped.ndjson" + + output_file = os.path.join(output_dir, fname) + # The full processing is deferred until the final write. + kernels_by_hash[kernel_hash]["compilation"] = json_str + kernels_by_hash[kernel_hash]["output_file"] = output_file + + elif event_type == "launch": + kernel_hash = parsed_json.get("compilation_metadata", {}).get("hash") + if kernel_hash: + kernels_by_hash[kernel_hash]["launches"].append( + (parsed_json, i + 1) ) - output_file_name = f"{file_name_without_extension}_mapped.ndjson" - else: - output_file_name = f"{file_name_without_extension}_mapped.ndjson" - output_file = os.path.join(output_dir, output_file_name) - outputs[output_file].append(parsed_line) - logger.debug(f"output file: {output_file}") + + # Organize lines for final output, keyed by output file path + all_output_lines = defaultdict(list) + for _kernel_hash, data in kernels_by_hash.items(): + compilation_json_str = data["compilation"] + launches_with_indices = data["launches"] + output_file = data["output_file"] + + if not output_file: + logger.warning(f"No output file for kernel hash {_kernel_hash}, skipping.") + continue + + # Process the compilation event now to include source mappings + if compilation_json_str: + processed_compilation_line = parse_single_trace_content( + compilation_json_str + ) + all_output_lines[output_file].append(processed_compilation_line) + compilation_event = json.loads(processed_compilation_line) + else: + compilation_event = None + + for launch_event, _ in launches_with_indices: + all_output_lines[output_file].append( + json.dumps(launch_event, separators=(",", ":")) + "\n" + ) + + if compilation_event and launches_with_indices: + sames, diffs, launch_index_map = _generate_launch_diff( + launches_with_indices + ) + launch_diff_event = { + "event_type": "launch_diff", + "hash": _kernel_hash, + "name": compilation_event.get("payload", {}) + .get("metadata", {}) + .get("name"), + "total_launches": len(launches_with_indices), + "launch_index_map": launch_index_map, + "diffs": diffs, + "sames": sames, + } + all_output_lines[output_file].append( + json.dumps(launch_diff_event, separators=(",", ":")) + "\n" + ) if not os.path.exists(output_dir): os.makedirs(output_dir) - for output_file, parsed_lines in outputs.items(): + + for output_file, final_lines in all_output_lines.items(): with open(output_file, "w") as out: - out.writelines(parsed_lines) + out.writelines(final_lines) def parse_args(): @@ -638,6 +682,178 @@ def parse_args(): return parser.parse_args() +# Fields that are expected to vary but are not useful to list out in the diff. +SUMMARY_FIELDS = ["pid", "timestamp", "stream", "function", "data_ptr"] + + +def _flatten_dict( + d: Dict[str, Any], parent_key: str = "", sep: str = "." +) -> Dict[str, Any]: + """ + Flattens a nested dictionary. + """ + items = [] + for k, v in d.items(): + new_key = parent_key + sep + k if parent_key else k + if isinstance(v, dict): + items.extend(_flatten_dict(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return dict(items) + + +def _unflatten_dict(d: Dict[str, Any], sep: str = ".") -> Dict[str, Any]: + """ + Unflattens a dictionary with delimited keys. + """ + result = {} + for key, value in d.items(): + parts = key.split(sep) + d_ref = result + for part in parts[:-1]: + if part not in d_ref: + d_ref[part] = {} + d_ref = d_ref[part] + d_ref[parts[-1]] = value + return result + + +def _to_ranges(indices: List[int]) -> List[Dict[str, int]]: + """ + Converts a sorted list of indices into a list of continuous ranges. + e.g., [0, 1, 2, 5, 6, 8] -> [{'start': 0, 'end': 2}, {'start': 5, 'end': 6}, {'start': 8, 'end': 8}] + """ + if not indices: + return [] + + indices = sorted(indices) + ranges = [] + start = indices[0] + end = indices[0] + + for i in range(1, len(indices)): + if indices[i] == end + 1: + end = indices[i] + else: + ranges.append({"start": start, "end": end}) + start = end = indices[i] + + ranges.append({"start": start, "end": end}) + return ranges + + +def _generate_launch_diff( + launches: List[Tuple[Dict[str, Any], int]], +) -> Tuple[Dict[str, Any], Dict[str, Any], List[Dict[str, int]]]: + """ + Compares a list of launch events and returns sames, diffs, and an index map. + """ + if not launches: + return {}, {}, [] + + launch_events = [launch[0] for launch in launches] + launch_index_map = [launch[1] for launch in launches] + + if len(launch_events) == 1: + return ( + _unflatten_dict(_flatten_dict(launch_events[0])), + {}, + _to_ranges(launch_index_map), + ) + + # Group values by key + data_by_key = defaultdict(lambda: defaultdict(list)) + for i, launch in enumerate(launch_events): + launch_flat = _flatten_dict(launch) + for key, value in launch_flat.items(): + # JSON doesn't support all Python types as values directly, str is safer + value_str = json.dumps(value, sort_keys=True) + data_by_key[key][value_str].append(i) + + sames_flat = {} + diffs_flat = {} + + for key, value_groups in data_by_key.items(): + if len(value_groups) == 1: + # This key has the same value across all launches + value_str = list(value_groups.keys())[0] + sames_flat[key] = json.loads(value_str) + else: + # This key has different values + is_summary = any(summary_key in key for summary_key in SUMMARY_FIELDS) + if is_summary: + diffs_flat[key] = { + "diff_type": "summary", + "summary_text": f"Varies across {len(value_groups)} unique values", + } + else: + values_dist = [] + for value_str, indices in value_groups.items(): + values_dist.append( + { + "value": json.loads(value_str), + "count": len(indices), + "launches": _to_ranges(indices), + } + ) + # Sort by first occurrence + values_dist.sort(key=lambda x: x["launches"][0]["start"]) + diffs_flat[key] = { + "diff_type": "distribution", + "values": values_dist, + } + + # Unflatten the results + sames_unflattened = _unflatten_dict(sames_flat) + diffs_unflattened = _unflatten_dict(diffs_flat) + + # Special handling for extracted_args to create argument_diff structures + if "extracted_args" in sames_unflattened or "extracted_args" in diffs_unflattened: + sames_args = sames_unflattened.pop("extracted_args", {}) + diffs_args_flat = diffs_unflattened.pop("extracted_args", {}) + + all_arg_names = set(sames_args.keys()) | set(diffs_args_flat.keys()) + + final_arg_diffs = {} + + for arg_name in all_arg_names: + if arg_name in diffs_args_flat: + # This argument has at least one differing sub-field. + arg_sames = {} + arg_diffs_internal = {} + + # Collect all sub-fields for this argument from the original data + all_sub_fields = set() + for launch in launch_events: + arg_data = launch.get("extracted_args", {}).get(arg_name, {}) + all_sub_fields.update(arg_data.keys()) + + for sub_field in all_sub_fields: + flat_key = f"extracted_args.{arg_name}.{sub_field}" + if flat_key in diffs_flat: + arg_diffs_internal[sub_field] = diffs_flat[flat_key] + elif flat_key in sames_flat: + arg_sames[sub_field] = sames_flat[flat_key] + + if arg_sames or arg_diffs_internal: + final_arg_diffs[arg_name] = { + "diff_type": "argument_diff", + "sames": arg_sames, + "diffs": arg_diffs_internal, + } + elif arg_name in sames_args: + # This argument is entirely the same across all launches. + # We move it back to the main sames dict for consistency. + if "extracted_args" not in sames_unflattened: + sames_unflattened["extracted_args"] = {} + sames_unflattened["extracted_args"][arg_name] = sames_args[arg_name] + + if final_arg_diffs: + diffs_unflattened["extracted_args"] = final_arg_diffs + + return sames_unflattened, diffs_unflattened, _to_ranges(launch_index_map) + + if __name__ == "__main__": args = parse_args() if args.input: diff --git a/tritonparse/structured_logging.py b/tritonparse/structured_logging.py index 76c6b6d..81711f9 100644 --- a/tritonparse/structured_logging.py +++ b/tritonparse/structured_logging.py @@ -156,6 +156,9 @@ def convert(obj): # 2. simple containers ---------------------------------------------------- if isinstance(obj, (list, tuple)): + # Handle namedtuple specially to preserve field names + if hasattr(obj, "_asdict"): + return convert(obj._asdict()) return [convert(x) for x in obj] if isinstance(obj, (set, frozenset)): @@ -810,7 +813,7 @@ def extract_arg_info(arg_dict): def add_launch_metadata(grid, metadata, arg_dict): # Extract detailed argument information extracted_args = extract_arg_info(arg_dict) - return {"launch_metadata_tritonparse": (grid, metadata, extracted_args)} + return {"launch_metadata_tritonparse": (grid, metadata._asdict(), extracted_args)} class JITHookImpl(JITHook): @@ -928,7 +931,7 @@ def __call__(self, metadata): ) if launch_metadata_tritonparse is not None: trace_data["grid"] = launch_metadata_tritonparse[0] - trace_data["metadata"] = launch_metadata_tritonparse[1] + trace_data["compilation_metadata"] = launch_metadata_tritonparse[1] trace_data["extracted_args"] = launch_metadata_tritonparse[ 2 ] # Now contains detailed arg info diff --git a/website/src/components/ArgumentViewer.tsx b/website/src/components/ArgumentViewer.tsx new file mode 100644 index 0000000..9925bb4 --- /dev/null +++ b/website/src/components/ArgumentViewer.tsx @@ -0,0 +1,134 @@ +import React, { useState } from 'react'; + +// Renders the value distribution (e.g., "16 (2 times, in launches: 1-2)") +const DistributionCell: React.FC<{ data: any }> = ({ data }) => { + if (!data) return null; + if (data.diff_type === 'summary') { + return {data.summary_text}; + } + if (data.diff_type === 'distribution' && data.values) { + return ( +
    + {data.values.map((item: any, index: number) => { + const launchRanges = item.launches + .map((r: any) => (r.start === r.end ? `${r.start + 1}` : `${r.start + 1}-${r.end + 1}`)) + .join(', '); + return ( +
  • + {JSON.stringify(item.value)} + ({item.count} times, in launches: {launchRanges}) +
  • + ); + })} +
+ ); + } + return {JSON.stringify(data)}; +}; + +// Renders a single row in the ArgumentViewer table +const ArgumentRow: React.FC<{ + argName: string; + argData: any; + isDiffViewer?: boolean; +}> = ({ argName, argData, isDiffViewer = false }) => { + // Case 1: This is a complex argument with internal differences + if (isDiffViewer && argData.diff_type === "argument_diff") { + const [isCollapsed, setIsCollapsed] = useState(false); + const { sames, diffs } = argData; + const hasSames = Object.keys(sames).length > 0; + const hasDiffs = Object.keys(diffs).length > 0; + + return ( +
+
setIsCollapsed(!isCollapsed)} + > +
{argName}
+
+ Complex argument with internal differences + {/* Dropdown arrow icon */} + +
+
+ {!isCollapsed && ( +
+ {hasSames && ( +
+
Unchanged Properties
+
+ {Object.entries(sames).map(([key, value]) => ( +
+ {key}: + {JSON.stringify(value)} +
+ ))} +
+
+ )} + {hasDiffs && ( +
+
Differing Properties
+
+ {Object.entries(diffs).map(([key, value]) => ( +
+ {key}: +
+
+ ))} +
+
+ )} +
+ )} +
+ ); + } + + // Case 2: This is a simple argument (in the "Sames" table) + return ( +
+
{argName}
+
{argData.type}
+
+ {typeof argData.value !== 'object' || argData.value === null ? + {String(argData.value)} : +
{JSON.stringify(argData, null, 2)}
+ } +
+
+ ); +}; + +// Main container component +const ArgumentViewer: React.FC<{ args: Record; isDiffViewer?: boolean; }> = ({ args, isDiffViewer = false }) => { + if (!args || Object.keys(args).length === 0) { + return
No arguments to display.
; + } + + // A "complex view" is needed if we are showing diffs and at least one of them is a complex argument_diff + const isComplexView = isDiffViewer && Object.values(args).some(arg => arg.diff_type === 'argument_diff'); + + return ( +
+ {/* Render header only for the simple, non-complex table view */} + {!isComplexView && ( +
+
Argument Name
+
Type
+
Value
+
+ )} + + {/* Rows */} +
+ {Object.entries(args).map(([argName, argData]) => ( + + ))} +
+
+ ); +}; + +export default ArgumentViewer; diff --git a/website/src/components/DiffViewer.tsx b/website/src/components/DiffViewer.tsx new file mode 100644 index 0000000..481598b --- /dev/null +++ b/website/src/components/DiffViewer.tsx @@ -0,0 +1,96 @@ +import ArgumentViewer from "./ArgumentViewer"; +import React from "react"; +import StackDiffViewer from "./StackDiffViewer"; + +interface DiffViewerProps { + diffs: any; +} + +const DiffViewer: React.FC = ({ diffs }) => { + if (!diffs || Object.keys(diffs).length === 0) { + return ( +

No differing fields detected.

+ ); + } + + // Separate different kinds of diffs + const extractedArgs = diffs.extracted_args; + const stackDiff = diffs.stack; + const otherDiffs = Object.fromEntries( + Object.entries(diffs).filter( + ([key]) => key !== "extracted_args" && key !== "stack" + ) + ); + + const renderSimpleDiff = (_key: string, data: any) => { + if (data.diff_type === "summary") { + return

{data.summary_text}

; + } + if (data.diff_type === "distribution") { + return ( +
    + {data.values.map((item: any, index: number) => { + const launchRanges = item.launches + .map((r: any) => + r.start === r.end + ? `${r.start + 1}` + : `${r.start + 1}-${r.end + 1}` + ) + .join(", "); + return ( +
  • + + {JSON.stringify(item.value)} + + + ({item.count} times, in launches: {launchRanges}) + +
  • + ); + })} +
+ ); + } + // Fallback for unexpected structures + return
{JSON.stringify(data, null, 2)}
; + }; + + return ( +
+ {extractedArgs && Object.keys(extractedArgs).length > 0 && ( +
+
+ Extracted Arguments +
+ +
+ )} + + {Object.keys(otherDiffs).length > 0 && ( +
+
+ Other Differing Fields +
+
+ {Object.entries(otherDiffs).map(([key, value]) => ( +
+ + {key} + +
{renderSimpleDiff(key, value)}
+
+ ))} +
+
+ )} + + {stackDiff && } + +
+ ); +}; + +export default DiffViewer; \ No newline at end of file diff --git a/website/src/components/StackDiffViewer.tsx b/website/src/components/StackDiffViewer.tsx new file mode 100644 index 0000000..06b06ee --- /dev/null +++ b/website/src/components/StackDiffViewer.tsx @@ -0,0 +1,82 @@ +import React, { useState } from 'react'; +import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter'; +import { oneLight } from 'react-syntax-highlighter/dist/esm/styles/prism'; + +// A single frame of a stack trace +const StackTraceFrame: React.FC<{ frame: any }> = ({ frame }) => ( +
+ {frame.filename}: + {frame.line} in{" "} + {frame.name} + {frame.line_code && ( +
+ + {frame.line_code} + +
+ )} +
+); + + +const StackDiffViewer: React.FC<{ stackDiff: any }> = ({ stackDiff }) => { + const [isCollapsed, setIsCollapsed] = useState(true); + + if (!stackDiff || stackDiff.diff_type !== 'distribution') { + return null; + } + + return ( +
+
setIsCollapsed(!isCollapsed)} + > + Stack Traces + {/* Dropdown arrow icon */} + + + +
+ {!isCollapsed && ( +
+ {stackDiff.values.map((item: any, index: number) => { + const launchRanges = item.launches + .map((r: any) => (r.start === r.end ? `${r.start + 1}` : `${r.start + 1}-${r.end + 1}`)) + .join(", "); + + return ( +
+

+ Variant seen {item.count} times (in launches: {launchRanges}) +

+
+ {Array.isArray(item.value) ? item.value.map((frame: any, frameIndex: number) => ( + + )) :

Invalid stack format

} +
+
+ ); + })} +
+ )} +
+ ); +}; + +export default StackDiffViewer; \ No newline at end of file diff --git a/website/src/pages/KernelOverview.tsx b/website/src/pages/KernelOverview.tsx index b78ed05..3708bdc 100644 --- a/website/src/pages/KernelOverview.tsx +++ b/website/src/pages/KernelOverview.tsx @@ -1,4 +1,6 @@ import React from "react"; +import ArgumentViewer from "../components/ArgumentViewer"; +import DiffViewer from "../components/DiffViewer"; import { ProcessedKernel } from "../utils/dataLoader"; interface KernelOverviewProps { @@ -8,6 +10,14 @@ interface KernelOverviewProps { onViewIR: (irType: string) => void; } +/** + * Determines if a metadata value is considered "long" and should be displayed at the end + */ +const isLongValue = (value: any): boolean => { + const formattedString = formatMetadataValue(value); + return formattedString.length > 50; +}; + /** * Formats a value for display in the metadata section * @param value - The value to format @@ -41,15 +51,15 @@ interface MetadataItemProps { const MetadataItem: React.FC = ({ label, value, - span = 1 + span = 1, }) => ( -
1 ? `col-span-${span}` : ''}`}> - - {label} - - - {value} - +
1 ? `col-span-${span}` : ""} ${ + span === 0 ? "col-span-full" : "" + }`} + > + {label} + {value}
); @@ -121,69 +131,152 @@ const KernelOverview: React.FC = ({ Compilation Metadata
-
- {/* Hash */} - {kernel.metadata.hash && ( - - )} - - {/* Target Info */} - {kernel.metadata.target && ( - <> - - - - - )} + {/* Short fields in responsive grid */} +
+ {/* All short metadata fields */} + {Object.entries(kernel.metadata) + .filter(([_key, value]) => !isLongValue(value)) + .map(([key, value]) => { + return ( + + word.charAt(0).toUpperCase() + word.slice(1) + ) + .join(" ")} + value={formatMetadataValue(value)} + /> + ); + })} +
- {/* Execution Configuration */} - - - + {/* Long fields in separate section within same container */} + {Object.entries(kernel.metadata).filter(([_key, value]) => + isLongValue(value) + ).length > 0 && ( +
+ {Object.entries(kernel.metadata) + .filter(([_key, value]) => isLongValue(value)) + .map(([key, value]) => ( +
+ + {key + .split("_") + .map( + (word) => + word.charAt(0).toUpperCase() + word.slice(1) + ) + .join(" ")} + + + {formatMetadataValue(value)} + +
+ ))} +
+ )} +
+
+ )} - {/* Cluster Dimensions */} - {kernel.metadata.cluster_dims && ( - - )} + {/* Launch Analysis Section */} + {kernel.launchDiff && ( +
+

+ Launch Analysis +

+
+

+ Total Launches:{" "} + {kernel.launchDiff.total_launches} +

- {/* Other Metadata */} - - + {/* Launch Index Map */} + {kernel.launchDiff.launch_index_map && ( +
+

+ Launch Locations in Original Trace{" "} + + (1-based line numbers) + +

+
+ {kernel.launchDiff.launch_index_map + .map((r: any) => + r.start === r.end + ? `${r.start}` + : `${r.start}-${r.end}` + ) + .join(", ")} +
+
+ )} - {/* Supported FP8 Types */} - {kernel.metadata.supported_fp8_dtypes && - kernel.metadata.supported_fp8_dtypes.length > 0 && ( - - )} + {/* Unchanged Fields */} + {kernel.launchDiff.sames && Object.keys(kernel.launchDiff.sames).length > 0 && ( +
+

+ Unchanged Launch Arguments +

+ +
+ )} - {/* Additional metadata fields */} - {Object.entries(kernel.metadata) - .filter( + {(() => { + const otherSames = Object.fromEntries( + Object.entries(kernel.launchDiff.sames).filter( ([key]) => - ![ - "hash", - "target", - "num_warps", - "num_ctas", - "num_stages", - "cluster_dims", - "enable_fp_fusion", - "launch_cooperative_grid", - "supported_fp8_dtypes", - ].includes(key) + key !== "compilation_metadata" && + key !== "extracted_args" && + key !== "event_type" ) - .map(([key, value]) => ( - word.charAt(0).toUpperCase() + word.slice(1)).join(" ")} value={formatMetadataValue(value)} /> - ))} + ); + + if (Object.keys(otherSames).length > 0) { + return ( +
+

+ Other Unchanged Fields +

+
+ {Object.entries(otherSames).map(([key, value]) => ( + + word.charAt(0).toUpperCase() + word.slice(1) + ) + .join(" ")} + value={formatMetadataValue(value)} + /> + ))} +
+
+ ); + } + return null; + })()} + + {/* Differing Fields */} +
+

+ Differing Fields +

+
)} - + {/* Stack Trace */}

- Stack Trace + Compilation Stack Trace

{kernel.stack.map((entry, index) => ( diff --git a/website/src/utils/dataLoader.ts b/website/src/utils/dataLoader.ts index ed8e84c..84f81d0 100644 --- a/website/src/utils/dataLoader.ts +++ b/website/src/utils/dataLoader.ts @@ -99,6 +99,12 @@ export interface LogEntry { source_mappings?: Record>; // Alternative field name for source_mapping python_source?: PythonSourceCodeInfo; }; + // Fields for launch_diff event type + hash?: string; + name?: string; + total_launches?: number; + diffs?: any; + sames?: any; } /** @@ -113,6 +119,7 @@ export interface ProcessedKernel { sourceMappings?: Record>; // Source mappings for each IR file pythonSourceInfo?: PythonSourceCodeInfo; // Python source code information metadata?: KernelMetadata; // Compilation metadata + launchDiff?: any; // Aggregated launch event differences } /** @@ -121,6 +128,7 @@ export interface ProcessedKernel { * @returns Array of LogEntry objects */ export function parseLogData(textData: string): LogEntry[] { + console.log("Starting to parse NDJSON data..."); if (typeof textData !== 'string') { throw new Error("Input must be a string in NDJSON format"); } @@ -142,9 +150,11 @@ export function parseLogData(textData: string): LogEntry[] { } if (entries.length === 0) { + console.error("No valid JSON entries found in NDJSON data"); throw new Error("No valid JSON entries found in NDJSON data"); } + console.log(`Successfully parsed ${entries.length} log entries.`); return entries; } catch (error) { console.error("Error parsing NDJSON data:", error); @@ -274,12 +284,19 @@ export function loadLogDataFromFile(file: File): Promise { * @returns Array of processed kernel objects ready for display */ export function processKernelData(logEntries: LogEntry[]): ProcessedKernel[] { - const kernels: ProcessedKernel[] = []; - for (let i = 0; i < logEntries.length; i++) { - const entry = logEntries[i]; - // Check for kernel events by event_type + console.log("Processing kernel data... Total entries:", logEntries.length); + const kernelsByHash: Map = new Map(); + + // First pass: process all compilation events + for (const entry of logEntries) { if (entry.event_type === "compilation" && entry.payload) { - // Ensure payload has file_path and file_content + const hash = entry.payload.metadata?.hash; + if (!hash) { + console.warn("Compilation event missing hash", entry); + continue; + } + console.log(`Processing compilation event for hash: ${hash}`) + if (!entry.payload.file_path || !entry.payload.file_content) { console.warn( "Kernel event missing file_path or file_content", @@ -287,10 +304,9 @@ export function processKernelData(logEntries: LogEntry[]): ProcessedKernel[] { ); continue; } - // Extract kernel name from IR filename + const irFileNames = Object.keys(entry.payload.file_path); let kernelName = "unknown_kernel"; - // Use first IR file name to determine kernel name if (irFileNames.length > 0) { const fileName = irFileNames[0]; const nameParts = fileName.split("."); @@ -300,19 +316,9 @@ export function processKernelData(logEntries: LogEntry[]): ProcessedKernel[] { : fileName; } - // Extract source mapping information from payload if available - let sourceMappings: Record< - string, - Record - > = {}; + const sourceMappings = entry.payload.source_mappings || {}; - if (entry.payload.source_mappings) { - // Use source mappings from the trace file - sourceMappings = entry.payload.source_mappings; - } - - // Create processed kernel object and add to results - kernels.push({ + const newKernel: ProcessedKernel = { name: kernelName, sourceFiles: entry.stack?.map(entry => typeof entry.filename === 'string' ? entry.filename : @@ -324,9 +330,32 @@ export function processKernelData(logEntries: LogEntry[]): ProcessedKernel[] { sourceMappings, pythonSourceInfo: entry.payload.python_source, metadata: entry.payload.metadata, - }); + }; + kernelsByHash.set(hash, newKernel); + console.log(`Stored kernel ${kernelName} with hash ${hash} into map.`); + } + } + console.log(`Finished first pass. Total kernels processed: ${kernelsByHash.size}`); + + // Second pass: attach launch_diff events + console.log("Starting second pass to attach launch_diff events..."); + for (const entry of logEntries) { + if (entry.event_type === "launch_diff") { // No payload for launch_diff + console.log("Found a launch_diff event:", entry); + const hash = entry.hash; + console.log(`launch_diff event hash: ${hash}`); + if (hash && kernelsByHash.has(hash)) { + const kernel = kernelsByHash.get(hash)!; + console.log(`Found matching kernel for hash ${hash}. Attaching launch_diff.`); + kernel.launchDiff = entry; // Attach the entire event object + } else { + console.warn(`Could not find matching kernel for launch_diff hash: ${hash}`); + } } } - return kernels; + + const finalKernels = Array.from(kernelsByHash.values()); + console.log("Finished processing. Final kernel objects:", finalKernels); + return finalKernels; }